diff --git a/CHANGES b/CHANGES index 0c48a3dc0..5d1cd1082 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,20 @@ LOG: +Version 4.2.0 vs 4.1.0 +- Added AccelWattch power model v1.0 which replaces GPUWattch. +- Added AccelWattch XML configuration files for SM7_QV100, SM7_TITANV, SM75_RTX2060_S, SM6_TITANX. Note that all these AccelWattch XML configuration files are tuned only for SM7_QV100. + +Version 4.1.0 versus 4.0.0 +-Features: +1- Supporting L1 write-allocate with sub-sector writing policy as in Volta+ hardware, and changing the Volta+ cards config to make L1 write-allocate with write-through +2- Making the L1 adaptive cache policy to be configurable +3- Adding Ampere RTX 3060 config files +-Bugs: +1- Fixing L1 bank hash function bug +2- Fixing L1 read hit counters in gpgpu-sim to match nvprof, to achieve more accurate L1 correlation with the HW +3- Fixing bugs in lazy write handling, thanks to Gwendolyn Voskuilen from Sandia labs for this fix +4- Fixing the backend pipeline for sub_core model +5- Fixing Memory stomp bug at the shader_config +6- Some code refactoring: Version 4.0.0 (development branch) versus 3.2.3 -Front-End: 1- Support .nc cache modifier and __ldg function to access the read-only L1D cache diff --git a/COPYRIGHT b/COPYRIGHT index a4eea2915..1c949f93e 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -44,3 +44,33 @@ per UBC policy 88, item 2.3 on literary works) these students names appear in the copyright notices of the respective files. UBC is also mentioned in the copyright notice to highlight that was the author's affiliation when the work was performed. + +NOTE 3: AccelWattch and all its components are covered by the following license and copyright. +Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +Northwestern University, Purdue University, The University of British Columbia +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer; +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution; +3. Neither the names of Northwestern University, Purdue University, + The University of British Columbia nor the names of their contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/Makefile b/Makefile index d248211cd..82ea39928 100644 --- a/Makefile +++ b/Makefile @@ -87,7 +87,7 @@ ifneq ($(GPGPUSIM_POWER_MODEL),) MCPAT_DBG_FLAG = dbg endif - MCPAT_OBJ_DIR = $(SIM_OBJ_FILES_DIR)/gpuwattch + MCPAT_OBJ_DIR = $(SIM_OBJ_FILES_DIR)/accelwattch MCPAT = $(MCPAT_OBJ_DIR)/*.o endif @@ -117,24 +117,24 @@ check_setup_environment: fi check_power: - @if [ -d "$(GPGPUSIM_ROOT)/src/gpuwattch/" -a ! -n "$(GPGPUSIM_POWER_MODEL)" ]; then \ + @if [ -d "$(GPGPUSIM_ROOT)/src/accelwattch/" -a ! -n "$(GPGPUSIM_POWER_MODEL)" ]; then \ echo ""; \ - echo " Power model detected in default directory ($(GPGPUSIM_ROOT)/src/gpuwattch) but GPGPUSIM_POWER_MODEL not set."; \ - echo " Please re-run setup_environment or manually set GPGPUSIM_POWER_MODEL to the gpuwattch directory if you would like to include the GPGPU-Sim Power Model."; \ + echo " Power model detected in default directory ($(GPGPUSIM_ROOT)/src/accelwattch) but GPGPUSIM_POWER_MODEL not set."; \ + echo " Please re-run setup_environment or manually set GPGPUSIM_POWER_MODEL to the accelwattch directory if you would like to include the GPGPU-Sim Power Model."; \ echo ""; \ true; \ elif [ ! -d "$(GPGPUSIM_POWER_MODEL)" ]; then \ echo ""; \ echo "ERROR ** Power model directory invalid."; \ echo "($(GPGPUSIM_POWER_MODEL)) is not a valid directory."; \ - echo "Please set GPGPUSIM_POWER_MODEL to the GPGPU-Sim gpuwattch directory."; \ + echo "Please set GPGPUSIM_POWER_MODEL to the GPGPU-Sim accelwattch directory."; \ echo ""; \ exit 101; \ elif [ -n "$(GPGPUSIM_POWER_MODEL)" -a ! -f "$(GPGPUSIM_POWER_MODEL)/gpgpu_sim.verify" ]; then \ echo ""; \ echo "ERROR ** Power model directory invalid."; \ echo "gpgpu_sim.verify not found in $(GPGPUSIM_POWER_MODEL)."; \ - echo "Please ensure that GPGPUSIM_POWER_MODEL points to a valid gpuwattch directory and that you have the correct GPGPU-Sim mcpat distribution."; \ + echo "Please ensure that GPGPUSIM_POWER_MODEL points to a valid accelwattch directory and that you have the correct GPGPU-Sim mcpat distribution."; \ echo ""; \ exit 102; \ fi @@ -243,8 +243,8 @@ makedirs: if [ ! -d $(SIM_OBJ_FILES_DIR)/libopencl/bin ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/libopencl/bin; fi; if [ ! -d $(SIM_OBJ_FILES_DIR)/$(INTERSIM) ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/$(INTERSIM); fi; if [ ! -d $(SIM_OBJ_FILES_DIR)/cuobjdump_to_ptxplus ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/cuobjdump_to_ptxplus; fi; - if [ ! -d $(SIM_OBJ_FILES_DIR)/gpuwattch ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/gpuwattch; fi; - if [ ! -d $(SIM_OBJ_FILES_DIR)/gpuwattch/cacti ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/gpuwattch/cacti; fi; + if [ ! -d $(SIM_OBJ_FILES_DIR)/accelwattch ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/accelwattch; fi; + if [ ! -d $(SIM_OBJ_FILES_DIR)/accelwattch/cacti ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/accelwattch/cacti; fi; all: $(MAKE) gpgpusim diff --git a/README.md b/README.md index 9f9f6698f..da0893585 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ Welcome to GPGPU-Sim, a cycle-level simulator modeling contemporary graphics processing units (GPUs) running GPU computing workloads written in CUDA or OpenCL. Also included in GPGPU-Sim is a performance visualization tool called -AerialVision and a configurable and extensible energy model called GPUWattch. -GPGPU-Sim and GPUWattch have been rigorously validated with performance and +AerialVision and a configurable and extensible power model called AccelWattch. 
+GPGPU-Sim and AccelWattch have been rigorously validated with performance and power measurements of real hardware GPUs. This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2, @@ -11,6 +11,11 @@ This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2, Please see the copyright notice in the file COPYRIGHT distributed with this release in the same directory as this file. +GPGPU-Sim 4.0 is compatible with Accel-Sim simulation framework. With the support +of Accel-Sim, GPGPU-Sim 4.0 can run NVIDIA SASS traces (trace-based simulation) +generated by NVIDIA's dynamic binary instrumentation tool (NVBit). For more information +about Accel-Sim, see [https://accel-sim.github.io/](https://accel-sim.github.io/) + If you use GPGPU-Sim 4.0 in your research, please cite: Mahmoud Khairy, Zhesheng Shen, Tor M. Aamodt, Timothy G Rogers. @@ -18,7 +23,7 @@ Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling. In proceedings of the 47th IEEE/ACM International Symposium on Computer Architecture (ISCA), May 29 - June 3, 2020. -If you use CuDNN or PyTorch support, checkpointing or our new debugging tool for functional +If you use CuDNN or PyTorch support (execution-driven simulation), checkpointing or our new debugging tool for functional simulation errors in GPGPU-Sim for your research, please cite: Jonathan Lew, Deval Shah, Suchita Pati, Shaylin Cattell, Mengchi Zhang, Amruth Sandhupatla, @@ -26,7 +31,6 @@ Christopher Ng, Negar Goli, Matthew D. Sinclair, Timothy G. Rogers, Tor M. Aamod Analyzing Machine Learning Workloads Using a Detailed GPU Simulator, arXiv:1811.08933, https://arxiv.org/abs/1811.08933 - If you use the Tensor Core model in GPGPU-Sim or GPGPU-Sim's CUTLASS Library for your research please cite: @@ -34,12 +38,11 @@ Md Aamir Raihan, Negar Goli, Tor Aamodt, Modeling Deep Learning Accelerator Enabled GPUs, arXiv:1811.08309, https://arxiv.org/abs/1811.08309 -If you use the GPUWattch energy model in your research, please cite: +If you use the AccelWattch power model in your research, please cite: -Jingwen Leng, Tayler Hetherington, Ahmed ElTantawy, Syed Gilani, Nam Sung Kim, -Tor M. Aamodt, Vijay Janapa Reddi, GPUWattch: Enabling Energy Optimizations in -GPGPUs, In proceedings of the ACM/IEEE International Symposium on Computer -Architecture (ISCA 2013), Tel-Aviv, Israel, June 23-27, 2013. +Vijay Kandiah, Scott Peverelle, Mahmoud Khairy, Junrui Pan, Amogh Manjunath, Timothy G. Rogers, Tor M. Aamodt, and Nikos Hardavellas. 2021. +AccelWattch: A Power Modeling Framework for Modern GPUs. In MICRO54: 54th Annual IEEE/ACM International Symposium on Microarchitecture +(MICRO ’21), October 18–22, 2021, Virtual Event, Greece. If you use the support for CUDA dynamic parallelism in your research, please cite: @@ -58,8 +61,8 @@ This file contains instructions on installing, building and running GPGPU-Sim. Detailed documentation on what GPGPU-Sim models, how to configure it, and a guide to the source code can be found here: . Instructions for building doxygen source code documentation are included below. -Detailed documentation on GPUWattch including how to configure it and a guide -to the source code can be found here: . + +Previous versions of GPGPU-Sim (3.2.0 to 4.1.0) included the [GPUWattch Energy model](http://gpgpu-sim.org/gpuwattch/) which has been replaced by AccelWattch version 1.0 in GPGPU-Sim version 4.2.0. AccelWattch supports modern GPUs and is validated against a NVIDIA Volta QV100 GPU. 
Detailed documentation on AccelWattch can be found here: [AccelWattch Overview](https://github.com/VijayKandiah/accel-sim-framework#accelwattch-overview) and [AccelWattch MICRO'21 Artifact Manual](https://github.com/VijayKandiah/accel-sim-framework/blob/release/AccelWattch.md). If you have questions, please sign up for the google groups page (see gpgpu-sim.org), but note that use of this simulator does not imply any level of @@ -104,21 +107,20 @@ library (part of the CUDA toolkit). Code to interface with the CUDA Math library is contained in cuda-math.h, which also includes several structures derived from vector_types.h (one of the CUDA header files). -## GPUWattch Energy Model +## AccelWattch Power Model -GPUWattch (introduced in GPGPU-Sim 3.2.0) was developed by researchers at the -University of British Columbia, the University of Texas at Austin, and the -University of Wisconsin-Madison. Contributors to GPUWattch include Tor -Aamodt's research group at the University of British Columbia: Tayler -Hetherington and Ahmed ElTantawy; Vijay Reddi's research group at the -University of Texas at Austin: Jingwen Leng; and Nam Sung Kim's research group -at the University of Wisconsin-Madison: Syed Gilani. +AccelWattch (introduced in GPGPU-Sim 4.2.0) was developed by researchers at +Northwestern University, Purdue University, and the University of British Columbia. +Contributors to AccelWattch include Nikos Hardavellas's research group at Northwestern University: +Vijay Kandiah; Tor Aamodt's research group at the University of British Columbia: Scott Peverelle; +and Timothy Rogers's research group at Purdue University: Mahmoud Khairy, Junrui Pan, and Amogh Manjunath. -GPUWattch leverages McPAT, which was developed by Sheng Li et al. at the +AccelWattch leverages McPAT, which was developed by Sheng Li et al. at the University of Notre Dame, Hewlett-Packard Labs, Seoul National University, and -the University of California, San Diego. The paper can be found at +the University of California, San Diego. The McPAT paper can be found at http://www.hpl.hp.com/research/mcpat/micro09.pdf. + # INSTALLING, BUILDING and RUNNING GPGPU-Sim Assuming all dependencies required by GPGPU-Sim are installed on your system, @@ -261,6 +263,7 @@ To clean the docs run The documentation resides at doc/doxygen/html. To run Pytorch applications with the simulator, install the modified Pytorch library as well by following instructions [here](https://github.com/gpgpu-sim/pytorch-gpgpu-sim). + ## Step 3: Run Before we run, we need to make sure the application's executable file is dynamically linked to CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "--cudart shared" in makefile (quotes should be excluded). @@ -311,15 +314,16 @@ need to re-compile your application simply to run it on GPGPU-Sim. To revert back to running on the hardware, remove GPGPU-Sim from your LD_LIBRARY_PATH environment variable. -The following GPGPU-Sim configuration options are used to enable GPUWattch +The following GPGPU-Sim configuration options are used to enable AccelWattch -power_simulation_enabled 1 (1=Enabled, 0=Not enabled) - -gpuwattch_xml_file .xml - + -power_simulation_mode 0 (0=AccelWattch_SASS_SIM or AccelWattch_PTX_SIM, 1=AccelWattch_SASS_HW, 2=AccelWattch_SASS_HYBRID) + -accelwattch_xml_file .xml -The GPUWattch XML configuration file name is set to gpuwattch.xml by default and -currently only supplied for GTX480 (default=gpuwattch_gtx480.xml). Please refer to - for more information. 
+The AccelWattch XML configuration file name is set to accelwattch_sass_sim.xml by default and is
+currently provided for SM7_QV100, SM7_TITANV, SM75_RTX2060_S, and SM6_TITANX.
+Note that all these AccelWattch XML configuration files are tuned only for SM7_QV100. Please refer to
+ for more information.

Running OpenCL applications is identical to running CUDA applications. However,
OpenCL applications need to communicate with the NVIDIA driver in order to
diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
index 5b243a5b6..652f0a09e 100644
--- a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
+++ b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
@@ -1,3 +1,32 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3.
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + # This config models the Pascal GP102 (NVIDIA TITAN X) # For more info about this card, see Nvidia White paper # http://international.download.nvidia.com/geforce-com/international/pdfs/GeForce_GTX_1080_Whitepaper_FINAL.pdf @@ -28,6 +57,7 @@ -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 12 -gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 # Pascal clock domains #-gpgpu_clock_domains ::: @@ -170,11 +200,8 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Pascal 102 --power_simulation_enabled 0 # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD #-trace_sampling_core 0 - diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 6fe04eecd..2a9bff015 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -1,8 +1,3 @@ -# This config models the Turing RTX 2060 -# For more info about turing architecture: -# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf -# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020 - # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 @@ -14,6 +9,7 @@ -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 -gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 # Compute Capability -gpgpu_compute_capability_major 7 @@ -27,31 +23,27 @@ -gpgpu_n_clusters 30 -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 12 --gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_n_sub_partition_per_mchannel 2 -# volta clock domains +# clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1365.0:1365.0:1365.0:3500.0 -# boost mode -# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0 +-gpgpu_clock_domains 1365:1365:1365:3500.5 # shader core pipeline config -gpgpu_shader_registers 65536 -gpgpu_registers_per_block 65536 -gpgpu_occupancy_sm_number 75 -# This implies a maximum of 32 warps/SM --gpgpu_shader_core_pipeline 1024:32 --gpgpu_shader_cta 32 +-gpgpu_shader_core_pipeline 1024:32 +-gpgpu_shader_cta 16 -gpgpu_simd_model 1 # Pipeline widths and number of FUs # ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE -## Turing has 4 SP SIMD units, 4 INT units, 4 SFU 
units, 8 Tensor core units -## We need to scale the number of pipeline registers to be equal to the number of SP units --gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4 +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 -gpgpu_num_sp_units 4 -gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 -gpgpu_num_int_units 4 -gpgpu_tensor_core_avail 1 -gpgpu_num_tensor_core_units 4 @@ -59,32 +51,18 @@ # Instruction latencies and initiation intervals # "ADD,MAX,MUL,MAD,DIV" # All Div operations are executed on SFU unit --ptx_opcode_latency_int 4,13,4,5,145,32 --ptx_opcode_initiation_int 2,2,2,2,8,4 --ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 +-ptx_opcode_latency_fp 4,4,4,4,39 -ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 8,19,8,8,330 --ptx_opcode_initiation_dp 4,4,4,4,130 --ptx_opcode_latency_sfu 100 +-ptx_opcode_latency_dp 64,64,64,64,330 +-ptx_opcode_initiation_dp 64,64,64,64,130 +-ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 -ptx_opcode_latency_tesnor 64 -ptx_opcode_initiation_tensor 64 -# Turing has four schedulers per core --gpgpu_num_sched_per_core 4 -# Greedy then oldest scheduler --gpgpu_scheduler gto -## In Turing, a warp scheduler can issue 1 inst per cycle --gpgpu_max_insn_issue_per_warp 1 --gpgpu_dual_issue_diff_exec_units 1 - -# shared memory bankconflict detection --gpgpu_shmem_num_banks 32 --gpgpu_shmem_limited_broadcast 0 --gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 75 - -# Trung has sub core model, in which each scheduler has its own register file and EUs +# sub core model: in which each scheduler has its own register file and EUs # i.e. schedulers are isolated -gpgpu_sub_core_model 1 # disable specialized operand collectors and use generic operand collectors instead @@ -92,26 +70,46 @@ -gpgpu_operand_collector_num_units_gen 8 -gpgpu_operand_collector_num_in_ports_gen 8 -gpgpu_operand_collector_num_out_ports_gen 8 -# turing has 8 banks dual-port, 4 schedulers, two banks per scheduler -# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version --gpgpu_num_reg_banks 16 +# register banks +-gpgpu_num_reg_banks 8 -gpgpu_reg_file_port_throughput 2 +# warp scheduling +-gpgpu_num_sched_per_core 4 +-gpgpu_scheduler lrr +# a warp scheduler issue mode +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo --gpgpu_adaptive_cache_config 0 +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 32,64 +-gpgpu_unified_l1d_size 96 +# L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32 --gpgpu_shmem_size 65536 --gpgpu_shmem_sizeDefault 65536 --gpgpu_shmem_per_block 65536 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:256:32,16:0,32 +-gpgpu_l1_latency 32 -gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 --gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_cache_write_ratio 25 + +# shared memory configuration +-gpgpu_shmem_size 65536 +-gpgpu_shmem_sizeDefault 65536 +-gpgpu_shmem_per_block 49152 +-gpgpu_smem_latency 30 +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 
0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 75
-# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache
+# L2 cache
-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
@@ -122,34 +120,31 @@
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
-# Note, TEX is deprected in Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod
+# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1
# interconnection
-#-network_mode 1
-#-inter_config_file config_turing_islip.icnt
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
--icnt_arbiter_algo 1
-icnt_flit_size 40
+-icnt_arbiter_algo 1
# memory partition latency config
--gpgpu_l2_rop_latency 160
--dram_latency 100
+-gpgpu_l2_rop_latency 194
+-dram_latency 96
-# dram model config
+# dram sched config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192
-# Turing has GDDR6
-# http://monitorinsider.com/GDDR6.html
+# dram model config
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 2
-gpgpu_dram_burst_length 16
@@ -157,9 +152,9 @@
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
-# Use the same GDDR5 timing, scaled to 3500MHZ
--gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62:
- CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4"
+# Mem timing
+-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4
+-dram_dual_bus_interface 0
# select lower bits for bnkgrp to increase bnkgrp parallelism
-dram_bnk_indexing_policy 0
@@ -174,11 +169,10 @@
-enable_ptx_file_line_stats 1
-visualizer_enabled 0
-# power model configs, disable it untill we create a real energy model for Volta
+# power model configs, disable it untill we create a real energy model
-power_simulation_enabled 0
# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
-#-trace_sampling_core 0
-
+#-trace_sampling_core 0
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt b/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt
new file mode 100644
index 000000000..eed1c34b6
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt
@@ -0,0 +1,73 @@
+//52*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 52;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 64;
+input_buffer_size = 256;
+ejection_buffer_size = 64;
+boundary_buffer_size = 64;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
new file mode 100644
index 000000000..0fb4742e1
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
@@ -0,0 +1,210 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2.
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +# This config models the Turing RTX 2060 Super +# For more info about turing architecture: +# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf +# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020 + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 75 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 5 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 34 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 16 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1905.0:1905.0:1905.0:3500.0 +# boost mode +# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 75 + +# This implies a maximum of 32 warps/SM +-gpgpu_shader_core_pipeline 1024:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 8 Tensor core units +## We need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,32 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 
+-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Turing has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Turing, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 75 + +# Trung has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# turing has 8 banks dual-port, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +-gpgpu_adaptive_cache_config 0 +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_shmem_size 65536 +-gpgpu_shmem_sizeDefault 65536 +-gpgpu_shmem_per_block 65536 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_latency 20 +-gpgpu_smem_latency 20 +-gpgpu_flush_l1_cache 1 + +# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 4MB L2 cache +-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 0 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_turing_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_arbiter_algo 1
+-icnt_flit_size 40
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# Turing has GDDR6
+# http://monitorinsider.com/GDDR6.html
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 2
+-gpgpu_dram_burst_length 16
+-dram_data_command_freq_ratio 4
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
+
+# Use the same GDDR5 timing, scaled to 3500MHZ
+-gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62:
+ CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4"
+
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch power-model XML parameters; markup not preserved in this copy of the diff]
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_normal/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal new file mode 100644 index 000000000..08fe73486 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_normal/gpgpusim.config_base_mee_normal @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 N:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 N:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32 +-gpgpu_cache:dmeta N:4:128:4,L:B:m:L:P,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[XML body not preserved in this excerpt]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[XML body not preserved in this excerpt]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[XML body not preserved in this excerpt]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[XML body not preserved in this excerpt]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[XML body not preserved in this excerpt]
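The gpgpusim.config files in this patch size the L2 from the -gpgpu_cache:dl2 geometry: 32 sets x 128-byte lines x 24 ways is 96 KB per memory sub-partition, and with -gpgpu_n_mem 32 channels x 2 sub-partitions that is the 6 MB noted in the config comments; the L2_4MB variant later in the patch drops the associativity to 16 ways. A minimal sketch of that arithmetic, assuming only the <sets>:<line size>:<associativity> prefix of the option string (the helper name and parsing are illustrative, not simulator code):

```python
# Sketch: decode the "<nsets>:<line_size>:<assoc>" prefix of a -gpgpu_cache:dl2
# value and compute total L2 capacity. Everything past the geometry prefix is
# ignored here; this is an editorial illustration, not GPGPU-Sim code.

def l2_capacity_bytes(dl2_opt: str, n_mem: int, sub_parts_per_channel: int) -> int:
    geometry = dl2_opt.split(",")[0]                      # e.g. "S:32:128:24"
    nsets, line_size, assoc = (int(x) for x in geometry.split(":")[-3:])
    return nsets * line_size * assoc * n_mem * sub_parts_per_channel

# 32 sets x 128 B x 24 ways = 96 KB per sub-partition; 32 channels x 2 -> 6 MB
print(l2_capacity_bytes("S:32:128:24,L:B:m:L:P,A:192:4,32:0,32", 32, 2) // (1024 * 1024))  # 6
# 16-way variant: 64 KB per sub-partition -> 4 MB
print(l2_capacity_bytes("S:32:128:16,L:B:m:L:P,A:192:4,32:0,32", 32, 2) // (1024 * 1024))  # 4
```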
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector new file mode 100644 index 000000000..f3ecca62b --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector/gpgpusim.config_base_mee_sector @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each 
scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:P,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:P,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[XML body not preserved in this excerpt]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[XML body not preserved in this excerpt]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[XML body not preserved in this excerpt]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[XML body not preserved in this excerpt]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[XML body not preserved in this excerpt]
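The DRAM settings shared by the configs above (-gpgpu_n_mem 32, -gpgpu_dram_buswidth 16 bytes, double data rate via -dram_data_command_freq_ratio 2, and the 850 MHz memory clock in -gpgpu_clock_domains) work out to roughly 870 GB/s of peak HBM bandwidth. A back-of-the-envelope sketch, assuming the usual peak-bandwidth formula; the helper name is illustrative only:

```python
# Sketch: peak DRAM bandwidth implied by the HBM settings in the configs above.
# peak = channels x bus width (bytes) x DRAM clock x data rate (DDR -> 2)

def peak_dram_bandwidth_gb_s(n_mem: int, buswidth_bytes: int,
                             dram_clock_mhz: float, data_rate: int = 2) -> float:
    return n_mem * buswidth_bytes * dram_clock_mhz * 1e6 * data_rate / 1e9

# 32 channels x 16 B x 850 MHz x 2 ~= 870 GB/s
print(round(peak_dram_bandwidth_gb_s(32, 16, 850.0)))  # 870
```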
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB new file mode 100644 index 000000000..23ce56ad6 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_4MB/gpgpusim.config_base_mee_sector_L2_4MB @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:16,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:P,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector
new file mode 100644
index 000000000..c468dd8bc
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_L2_6MB/gpgpusim.config_base_mee_sector
@@ -0,0 +1,250 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+# The University of British Columbia nor the names of their contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:P,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM: three stacks, 24 channels, each 128 bits (16 bytes) wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing and need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz; V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, so it can issue a column and a row command in the same cycle
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
[623 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
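The two -gpgpu_dram_timing_opt strings in the config above describe the same HBM timings at two command clocks: per the surrounding comments, the commented-out set is written in 1 GHz cycles and the active set in 850 MHz cycles. A short illustrative sketch, not part of the patch, converts both to nanoseconds to show they line up to within a clock period of rounding.

```python
# Illustrative sketch (not part of the patch): the commented-out timing string is in
# 1 GHz cycles, the active one in 850 MHz cycles; converting both to nanoseconds
# shows they describe essentially the same HBM timings.
timing_1ghz = dict(CCD=1, RRD=4, RCD=14, RAS=33, RP=14, RC=47, CL=14, WL=2,
                   CDLR=3, WR=12, CCDL=2, RTPL=4)
timing_850mhz = dict(CCD=1, RRD=3, RCD=12, RAS=28, RP=12, RC=40, CL=12, WL=2,
                     CDLR=3, WR=10, CCDL=2, RTPL=3)

for name, cycles_1ghz in timing_1ghz.items():
    ns_1ghz = cycles_1ghz / 1.0             # cycles at 1.0 GHz -> ns
    ns_850 = timing_850mhz[name] / 0.85     # cycles at 0.85 GHz -> ns
    print(f"{name:4s}  1GHz: {ns_1ghz:5.1f} ns   850MHz: {ns_850:5.1f} ns")
# The two columns agree to within one clock period of rounding.
```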
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB
new file mode 100644
index 000000000..736c5d7f8
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_16KB/gpgpusim.config_base_mee_sector_large_mdc_16KB
@@ -0,0 +1,250 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3.
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:32:128:4,L:B:m:L:P,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM: three stacks, 24 channels, each 128 bits (16 bytes) wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing and need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz; V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, so it can issue a column and a row command in the same cycle
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
[623 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
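The adaptive-cache options earlier in this config (-gpgpu_adaptive_cache_config 1, -gpgpu_unified_l1d_size 128, -gpgpu_shmem_option 0,8,16,32,64,96) model Volta's unified 128 KB L1/shared storage per SM, where whatever is not carved out as shared memory is left to the L1 data cache. Below is a minimal illustrative sketch of that split, not part of the patch; it only restates the comment that a zero shared-memory carve-out yields a 128 KB L1.

```python
# Illustrative sketch (not part of the patch): the adaptive Volta L1/shared split.
# 128 KB of unified per-SM storage (-gpgpu_unified_l1d_size 128); the carve-outs in
# -gpgpu_shmem_option go to shared memory and the remainder is modeled as L1D.
unified_kb = 128
shmem_options_kb = [0, 8, 16, 32, 64, 96]   # -gpgpu_shmem_option

for shmem_kb in shmem_options_kb:
    l1d_kb = unified_kb - shmem_kb
    print(f"shared = {shmem_kb:3d} KB  ->  L1D = {l1d_kb:3d} KB")
# The 96 KB shared / 32 KB L1D row matches the default configuration named in the
# comments above; a 0 KB shared carve-out yields the full 128 KB L1D.
```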
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
[623 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
[613 added lines: AccelWattch power-model XML configuration; element content not preserved in this patch text]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector
new file mode 100644
index 000000000..d09d76b43
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_2KB/gpgpusim.config_base_mee_sector
@@ -0,0 +1,250 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3.
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+# All Div operations are executed on the SFU unit
+-ptx_opcode_latency_int 4,13,4,5,145,21
+-ptx_opcode_initiation_int 2,2,2,2,8,4
+-ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_initiation_fp 2,2,2,2,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 4,4,4,4,130
+-ptx_opcode_latency_sfu 100
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# Volta has a sub-core model in which each scheduler has its own register file and EUs,
+# i.e., schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# Volta has 8 banks, 4 schedulers, two banks per scheduler;
+# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
+-gpgpu_num_reg_banks 16
+-gpgpu_reg_file_port_throughput 2
+
+# shared memory bank conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 70
+
+# Volta has four schedulers per core
+-gpgpu_num_sched_per_core 4
+# Loose round-robin scheduler (lrr)
+-gpgpu_scheduler lrr
+## In Volta, a warp scheduler can issue 1 inst per cycle
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# Default config is 32KB DL1 and 96KB shared memory
+# In Volta, we assign the remaining shared memory to L1 cache
+# if the assigned shared memory = 0, then L1 cache = 128KB
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+# disable this mode when running multiple kernels/apps concurrently
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
+-gpgpu_l1_banks 4
+#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
+-gpgpu_shmem_size 98304
+-gpgpu_shmem_sizeDefault 98304
+-gpgpu_shmem_per_block 65536
+-gpgpu_smem_latency 20
+
+# 32 sets, 128-byte lines, 24-way for each memory sub-partition (96 KB per sub-partition). This gives us 6MB of L2 cache in total
+#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32
+-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,P:64:64,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 0
+-gpgpu_memory_partition_indexing 2
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note: TEX is deprecated in Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM: three stacks, 24 channels, each 128 bits (16 bytes) wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing and need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz; V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+ CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, in which it can issue a column and a row command at the same time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
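The sizing comments in the config above can be checked with a couple of lines of arithmetic. The short Python sketch below is illustrative only and restates values that already appear in this file (2048 threads and 32-wide warps per SM, the dl2 geometry, 32 memory channels with 2 sub-partitions each).

# Sanity-check arithmetic for the QV100 gpgpusim.config values above (illustrative only).
threads_per_sm, warp_size = 2048, 32         # -gpgpu_shader_core_pipeline 2048:32
print(threads_per_sm // warp_size)           # 64 warps/SM, as the comment says

# L2: -gpgpu_cache:dl2 S:32:128:24 -> 32 sets x 128 B lines x 24 ways per sub-partition
l2_per_subpartition = 32 * 128 * 24          # 98304 B = 96 KB
subpartitions = 32 * 2                       # -gpgpu_n_mem 32 x -gpgpu_n_sub_partition_per_mchannel 2
print(l2_per_subpartition * subpartitions / 2**20)  # 6.0 MB of L2 in total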
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB new file mode 100644 index 000000000..775755039 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_32KB/gpgpusim.config_base_mee_sector_large_mdc_32KB @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+# All Div operations are executed on the SFU unit
+-ptx_opcode_latency_int 4,13,4,5,145,21
+-ptx_opcode_initiation_int 2,2,2,2,8,4
+-ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_initiation_fp 2,2,2,2,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 4,4,4,4,130
+-ptx_opcode_latency_sfu 100
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# Volta has a sub-core model in which each scheduler has its own register file and EUs,
+# i.e., schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# Volta has 8 banks, 4 schedulers, two banks per scheduler;
+# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
+-gpgpu_num_reg_banks 16
+-gpgpu_reg_file_port_throughput 2
+
+# shared memory bank conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 70
+
+# Volta has four schedulers per core
+-gpgpu_num_sched_per_core 4
+# Loose round-robin scheduler (lrr)
+-gpgpu_scheduler lrr
+## In Volta, a warp scheduler can issue 1 inst per cycle
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# Default config is 32KB DL1 and 96KB shared memory
+# In Volta, we assign the remaining shared memory to L1 cache
+# if the assigned shared memory = 0, then L1 cache = 128KB
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+# disable this mode when running multiple kernels/apps concurrently
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
+-gpgpu_l1_banks 4
+#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
+-gpgpu_shmem_size 98304
+-gpgpu_shmem_sizeDefault 98304
+-gpgpu_shmem_per_block 65536
+-gpgpu_smem_latency 20
+
+# 32 sets, 128-byte lines, 24-way for each memory sub-partition (96 KB per sub-partition). This gives us 6MB of L2 cache in total
+#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:8:4,32:0,32
+-gpgpu_cache:dmeta S:64:128:4,L:B:m:L:P,A:64:64,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 0
+-gpgpu_memory_partition_indexing 2
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note: TEX is deprecated in Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM: three stacks, 24 channels, each 128 bits (16 bytes) wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing and need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz; V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+ CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, in which it can issue a column and a row command at the same time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
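The base_mee_sector_large_mdc_* directories added by this patch appear to differ only in their -gpgpu_cache:dmeta line. Assuming the usual GPGPU-Sim cache geometry of <nsets>:<bsize>:<assoc> after the sector/normal prefix, the metadata-cache capacity in each directory name falls out of that one option; the sketch below only decodes the three geometries visible in this patch and is not part of the patch itself.

# Decode the per-sub-partition metadata-cache (dmeta) capacity from its geometry string,
# assuming <nsets>:<bsize>:<assoc> as in the other gpgpu-sim cache options (illustrative).
def dmeta_kb(geometry: str) -> float:
    nsets, bsize, assoc = (int(x) for x in geometry.split(":")[1:4])  # skip the "S"/"N" prefix
    return nsets * bsize * assoc / 1024

print(dmeta_kb("S:8:128:4"))    # 4.0  -> base_mee_sector_large_mdc_4KB
print(dmeta_kb("S:64:128:4"))   # 32.0 -> base_mee_sector_large_mdc_32KB
print(dmeta_kb("S:128:128:4"))  # 64.0 -> base_mee_sector_large_mdc_64KB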
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB new file mode 100644 index 000000000..cc2a9a55d --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_4KB/gpgpusim.config_base_mee_sector_large_mdc_4KB @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+# All Div operations are executed on the SFU unit
+-ptx_opcode_latency_int 4,13,4,5,145,21
+-ptx_opcode_initiation_int 2,2,2,2,8,4
+-ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_initiation_fp 2,2,2,2,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 4,4,4,4,130
+-ptx_opcode_latency_sfu 100
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# Volta has a sub-core model in which each scheduler has its own register file and EUs,
+# i.e., schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# Volta has 8 banks, 4 schedulers, two banks per scheduler;
+# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
+-gpgpu_num_reg_banks 16
+-gpgpu_reg_file_port_throughput 2
+
+# shared memory bank conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 70
+
+# Volta has four schedulers per core
+-gpgpu_num_sched_per_core 4
+# Loose round-robin scheduler (lrr)
+-gpgpu_scheduler lrr
+## In Volta, a warp scheduler can issue 1 inst per cycle
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# Default config is 32KB DL1 and 96KB shared memory
+# In Volta, we assign the remaining shared memory to L1 cache
+# if the assigned shared memory = 0, then L1 cache = 128KB
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+# disable this mode when running multiple kernels/apps concurrently
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
+-gpgpu_l1_banks 4
+#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
+-gpgpu_shmem_size 98304
+-gpgpu_shmem_sizeDefault 98304
+-gpgpu_shmem_per_block 65536
+-gpgpu_smem_latency 20
+
+# 32 sets, 128-byte lines, 24-way for each memory sub-partition (96 KB per sub-partition). This gives us 6MB of L2 cache in total
+#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32
+-gpgpu_cache:dmeta S:8:128:4,L:B:m:L:P,A:64:64,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 0
+-gpgpu_memory_partition_indexing 2
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note: TEX is deprecated in Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM: three stacks, 24 channels, each 128 bits (16 bytes) wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing and need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz; V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+ CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, in which it can issue a column and a row command at the same time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
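The DRAM options shared by all of these configs imply a peak HBM bandwidth in the neighbourhood of the V100's nominal ~900 GB/s. The sketch below is a rough, illustrative estimate that multiplies only values taken from this config (channel count, bus width, the DDR ratio, and the 850 MHz DRAM clock).

# Rough peak-bandwidth estimate from the DRAM options above (illustrative only).
n_channels = 32          # -gpgpu_n_mem 32, one controller per channel (-gpgpu_n_mem_per_ctrlr 1)
buswidth_bytes = 16      # -gpgpu_dram_buswidth 16
ddr_ratio = 2            # -dram_data_command_freq_ratio 2 ("HBM is DDR")
dram_clock_hz = 850e6    # last field of -gpgpu_clock_domains

peak_gb_s = n_channels * buswidth_bytes * ddr_ratio * dram_clock_hz / 1e9
print(peak_gb_s)         # ~870 GB/s, close to the V100's advertised ~900 GB/s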
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB new file mode 100644 index 000000000..d7d1124e7 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_64KB/gpgpusim.config_base_mee_sector_large_mdc_64KB @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:128:128:4,L:B:m:L:P,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB new file mode 100644 index 000000000..62fd4494c --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_large_mdc_8KB/gpgpusim.config_base_mee_sector_large_mdc_8KB @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# 
Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:16:128:4,L:B:m:L:P,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 new file mode 100644 index 000000000..9786d2436 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x16/gpgpusim.config_base_mee_sector_mdc_4x16 @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:16,L:B:m:L:P,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+<!-- AccelWattch power model parameter values omitted -->
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 new file mode 100644 index 000000000..f0ce0c712 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mdc_4x8/gpgpusim.config_base_mee_sector_mdc_4x8 @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 4,4,4,4,130
+-ptx_opcode_latency_sfu 100
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# Volta has a sub core model, in which each scheduler has its own register file and EUs
+# i.e. schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# Volta has 8 banks, 4 schedulers, two banks per scheduler
+# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
+-gpgpu_num_reg_banks 16
+-gpgpu_reg_file_port_throughput 2
+
+# shared memory bank conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 70
+
+# Volta has four schedulers per core
+-gpgpu_num_sched_per_core 4
+# Loose round robin (lrr) scheduler; gto (greedy-then-oldest) is also supported
+-gpgpu_scheduler lrr
+## In Volta, a warp scheduler can issue 1 inst per cycle
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# Default config is 32KB DL1 and 96KB shared memory
+# In Volta, we assign the remaining shared memory to L1 cache
+# if the assigned shared memory = 0, then L1 cache = 128KB
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+# disable this mode when running multiple kernels/apps concurrently
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
+-gpgpu_l1_banks 4
+#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
+-gpgpu_shmem_size 98304
+-gpgpu_shmem_sizeDefault 98304
+-gpgpu_shmem_per_block 65536
+-gpgpu_smem_latency 20
+
+# 32 sets, each 128 bytes, 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB of L2 cache
+#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32
+-gpgpu_cache:dmeta S:4:128:8,L:B:m:L:P,A:64:64,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 0
+-gpgpu_memory_partition_indexing 2
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note: TEX is deprecated in Volta; it is kept for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg() method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM, three stacks, 24 channels, each 16 bytes (128 bits) wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing, need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz; V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, in which it can issue a column and a row command at the same time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power model parameters omitted]
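The L2 sizing comment in the gpgpusim.config above (32 sets, 128-byte lines, 24 ways per memory sub partition, 6MB in total) is consistent with the other options in the same file. A quick back-of-the-envelope check, written as a standalone Python snippet rather than anything GPGPU-Sim itself runs:

# Sanity check of the L2 sizing comment in gpgpusim.config.
# -gpgpu_cache:dl2 S:32:128:24,...  ->  32 sets x 128-byte lines x 24 ways
# per memory sub partition; -gpgpu_n_mem 32 and
# -gpgpu_n_sub_partition_per_mchannel 2 give 64 sub partitions in total.

sets, line_bytes, ways = 32, 128, 24
n_mem, sub_partitions_per_channel = 32, 2

per_sub_partition = sets * line_bytes * ways                       # bytes
total_l2 = per_sub_partition * n_mem * sub_partitions_per_channel

print(per_sub_partition // 1024, "KB per sub partition")           # 96 KB
print(total_l2 // (1024 * 1024), "MB total L2")                    # 6 MB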
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 new file mode 100644 index 000000000..1c39f19da --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_128/gpgpusim.config_base_mee_sector_mshr_128 @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:128:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ [remainder of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
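All four gpgpusim.config files in this patch enable the Volta adaptive L1/shared split (-gpgpu_adaptive_cache_config 1, -gpgpu_shmem_option 0,8,16,32,64,96, -gpgpu_unified_l1d_size 128): whatever is not carved out as shared memory is used as L1 data cache, so a 0KB shared-memory bin gives a 128KB L1 and the 96KB bin gives the 32KB DL1 / 96KB default. The Python sketch below only illustrates that carve-out rule under those assumptions; how GPGPU-Sim actually selects the shared-memory bin for a given kernel is not reproduced here.

# Simplified sketch of the Volta adaptive L1/shared-memory carve-out that the
# comments in these configs describe: the unified 128 KB array is split so
# that whatever is not claimed as shared memory serves as L1 data cache.
# (Illustrative only; not GPGPU-Sim's actual bin-selection code.)

UNIFIED_KB = 128                           # -gpgpu_unified_l1d_size
SHMEM_OPTIONS_KB = [0, 8, 16, 32, 64, 96]  # -gpgpu_shmem_option

def split(shmem_needed_kb: int) -> tuple[int, int]:
    """Return (shared_kb, l1_kb) for the smallest bin that fits the request."""
    for bin_kb in SHMEM_OPTIONS_KB:
        if bin_kb >= shmem_needed_kb:
            return bin_kb, UNIFIED_KB - bin_kb
    # Request exceeds every bin: fall back to the largest shared-memory option.
    return SHMEM_OPTIONS_KB[-1], UNIFIED_KB - SHMEM_OPTIONS_KB[-1]

print(split(0))    # (0, 128)  -> all 128 KB usable as L1, as the comment says
print(split(48))   # (64, 64)
print(split(96))   # (96, 32)  -> the 32 KB DL1 / 96 KB shared default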
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 new file mode 100644 index 000000000..06bb4a298 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_32/gpgpusim.config_base_mee_sector_mshr_32 @@ -0,0 +1,250 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:32:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ [remainder of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
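The HBM section shared by these configs (-gpgpu_n_mem 32 channels, -gpgpu_dram_buswidth 16 bytes, -dram_data_command_freq_ratio 2, 850 MHz DRAM clock in -gpgpu_clock_domains) works out to a peak bandwidth of roughly 870 GB/s, which matches the Quadro GV100 part being modeled. The estimate below assumes the conventional channels x bus width x data rate reading of those options; it is my sanity check, not a figure the simulator reports from this file.

# Rough peak-bandwidth estimate from the HBM parameters in these configs.
# Assumes peak BW = channels x bus width x (memory clock x data rate).

n_channels = 32          # -gpgpu_n_mem
bus_width_bytes = 16     # -gpgpu_dram_buswidth (128 bits)
mem_clock_hz = 850e6     # DRAM domain in -gpgpu_clock_domains
data_rate = 2            # -dram_data_command_freq_ratio (HBM is DDR)

peak_gb_s = n_channels * bus_width_bytes * mem_clock_hz * data_rate / 1e9
print(f"~{peak_gb_s:.0f} GB/s peak")   # ~870 GB/s, close to the Quadro GV100 spec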
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power model parameters omitted]
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64 b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64
new file mode 100644
index 000000000..d0316d715
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_mshr_64/gpgpusim.config_base_mee_sector_mshr_64
@@ -0,0 +1,250 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+# The University of British Columbia nor the names of their contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with the .nc modifier or the __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM, three stacks, 24 channels, each (128 bits) 16 bytes wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2 # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing and need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz; V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has a dual bus interface, so it can issue a column and a row command at the same time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
+# power simulation
+-power_simulation_enabled 1
+-power_simulation_mode 0
+-accelwattch_xml_file accelwattch_sass_sim.xml
+
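The configuration file that ends above sizes each memory sub-partition's L2 slice as 32 sets x 24 ways x 128-byte lines (-gpgpu_cache:dl2 S:32:128:24) and pairs it with -gpgpu_n_mem 32 and -gpgpu_n_sub_partition_per_mchannel 2. The short check below, plain Python rather than simulator code, reproduces the "96 KB per memory sub partition" and "6MB L2 cache" figures quoted in the file's own comments.

# Verify the L2 sizing stated in the config comments above.
sets, line_bytes, ways = 32, 128, 24      # -gpgpu_cache:dl2 S:32:128:24 (sets : line size : assoc)
n_mem, sub_parts = 32, 2                  # -gpgpu_n_mem, -gpgpu_n_sub_partition_per_mchannel

slice_bytes = sets * ways * line_bytes    # 98304 bytes = 96 KB per memory sub-partition
total_mb = slice_bytes * n_mem * sub_parts / (1024 * 1024)
print(slice_bytes // 1024, "KB per sub-partition,", total_mb, "MB in total")   # 96 KB, 6.0 MB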
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
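The accelwattch_sass_hw.xml added above and the accelwattch_sass_hybrid.xml that follows appear to be intended for AccelWattch's hardware-calibrated modes, which rely on measured per-kernel counters such as those this patch adds in configs/tested-cfgs/SM7_QV100/hw_perf.csv later in the diff. The sketch below only reads that CSV using the column names from its header; it makes no claim about how AccelWattch itself weights the counters.

# Minimal sketch (not simulator code): load the per-kernel hardware counters from
# the hw_perf.csv file added by this patch and print a few of its columns.
import csv

with open("configs/tested-cfgs/SM7_QV100/hw_perf.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row["Benchmark"], row["Kernel"][:40],
              "cycles:", row["Elapsed_Cycles"], "DRAM reads:", row["DRAM_Rd"])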
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
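The config_volta_islip.icnt that follows is identical to the copy added earlier for base_mee_sector_mshr_64: a single-stage butterfly ("fly") with k = 144 endpoints and flit_size = 40. That endpoint count matches the 80 SM clusters plus 32 memory channels x 2 sub-partitions set in the accompanying gpgpusim.config, and the new SM86_RTX3070 interconnect file later in this patch follows the same pattern with 46 + 16 x 2 = 78. The check below assumes k simply counts shader clusters plus memory sub-partitions, which is consistent with the values in this patch.

# Sanity check of the icnt "k" values against the core/memory counts in the configs.
def icnt_endpoints(n_clusters, n_mem, sub_parts_per_channel):
    return n_clusters + n_mem * sub_parts_per_channel

print(icnt_endpoints(80, 32, 2))   # 144 -> k in config_volta_islip.icnt (SM7_QV100 configs)
print(icnt_endpoints(46, 16, 2))   # 78  -> k in config_ampere_islip.icnt (SM86_RTX3070)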
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/config_volta_islip.icnt b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector
new file mode 100644
index 000000000..d0316d715
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/base_mee_sector_perf_mdc/gpgpusim.config_base_mee_sector
@@ -0,0 +1,250 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+# The University of British Columbia nor the names of their contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 +-gpgpu_max_cycle 2000000 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 
2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +#-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +#-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +#-gpgpu_cache:dmeta N:4:128:4,L:B:m:W:X,A:64:4,32:0,32 +-gpgpu_cache:dmeta S:4:128:4,L:B:m:L:X,A:64:64,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 0 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +#-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,A:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +#-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,F:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +#-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,A:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channles, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + +# power simulation +-power_simulation_enabled 1 +-power_simulation_mode 0 +-accelwattch_xml_file accelwattch_sass_sim.xml + diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index c4818d10f..76c99b7d6 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -1,4 +1,34 @@ -# This config models the Volta +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. 
Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 # For more info about volta architecture: # http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf # https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# @@ -34,10 +64,11 @@ -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 32 -gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 # volta clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0 +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 # boost mode # -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 @@ -94,12 +125,12 @@ -gpgpu_shmem_num_banks 32 -gpgpu_shmem_limited_broadcast 0 -gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 60 +-gpgpu_coalesce_arch 70 # Volta has four schedulers per core -gpgpu_num_sched_per_core 4 # Greedy then oldest scheduler --gpgpu_scheduler gto +-gpgpu_scheduler lrr ## In Volta, a warp scheduler can issue 1 inst per cycle -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 @@ -113,17 +144,21 @@ # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 -# Volta unified cache has four banks +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 @@ -195,11 +230,7 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta --power_simulation_enabled 0 - # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - +#-trace_sampling_core 0 \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/hw_perf.csv b/configs/tested-cfgs/SM7_QV100/hw_perf.csv new file mode 100644 index 000000000..aa88bb256 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/hw_perf.csv @@ -0,0 +1,26 @@ +Benchmark,Kernel,L1_RH,L1_RM,L1_WH,L1_WM,CC_ACC,SHRD_ACC,DRAM_Rd,DRAM_Wr,L2_RH,L2_RM,L2_WH,L2_WM,NOC,Pipeline_Duty,Num_Idle_SMs,Elapsed_Cycles,Chip Voltage +b+tree-rodinia-3.1,findRangeK,1634256.0,561818.0,40785.0,19032.0,0.0,0.0,259346.0,3524.0,396522.0,259508.0,60000.0,0.0,1343246.0,0.3268163900773488,5.064000000000002,66542.7,1.0 +b+tree-rodinia-3.1,findK,1318908.0,525035.0,42619.0,7404.0,0.0,0.0,255317.0,2582.0,366918.0,255364.0,50000.0,0.0,1250108.0,0.2740918672650619,3.191999999999995,80883.0,1.0 +backprop-rodinia-3.1,_Z22bpnn_layerforward_CUDAPfS_S_S_ii,49152.0,143738.0,192432.0,4232.0,0.0,413696.0,147464.0,60097.0,29059.0,147460.0,196608.0,0.0,704512.0,0.5619432556155418,7.520000000000007,23324.775,1.0 +backprop-rodinia-3.1,_Z24bpnn_adjust_weights_cudaPfiS_iS_S_,465990.0,277805.0,327015.0,887.0,0.0,0.0,286738.0,190646.0,54315.0,286734.0,327686.0,0.0,1263518.0,0.20116733697224465,9.496000000000002,32578.425,1.0 +hotspot-rodinia-3.1,_Z14calculate_tempiPfS_S_iiiiffffff,4250.0,691050.0,0.0,175104.0,0.0,997428.0,262147.0,66263.0,486965.0,262144.0,175104.0,0.0,1732988.0,0.9470499252952201,3.3200000000000074,56438.825,1.0 +kmeans-rodinia-3.1,_Z11kmeansPointPfiiiPiS_S_S0_,0.0,0.0,0.0,102400.0,4352107.0,0.0,12302960.0,92472.5,6742186.0,12321532.0,102400.0,0.0,26022036.0,0.11420395712434231,1.5799999999999947,894550.775,1.0 +srad_v1-rodinia-3.1,_Z4sradfiilPiS_S_S_PfS0_S0_S0_fS0_S0_,158304.87000000002,89035.40999999999,0.0,143700.0,0.0,0.0,28986.500000000033,45424.200000000026,68135.7,28984.00000000001,143700.0,0.0,481258.2600000001,0.5320091849844065,15.272880000000004,14251.741749999997,1.0 +parboil-sad,_Z11mb_sad_calcPtS_ii,101840.0,415925.0,2102177.0,7289373.0,0.0,10033920.0,257308.0,8720433.0,8754664.0,257280.0,9390720.0,0.0,36398656.0,0.25130932753519797,0.19199999999999662,6551129.125,1.0 +parboil-sgemm,_Z9mysgemmNTPKfiS0_iPfiiff,7109956.0,2452728.0,133388.0,1284.0,0.0,8642304.0,393092.0,36894.0,2059512.0,393088.0,135168.0,0.0,5176696.0,0.5495706862295477,1.8799999999999972,358744.025,1.0 +parboil-mri-q,_Z12ComputeQ_GPUiiPfS_S_S_S_,0.0,163840.0,65184.0,154.0,17617612.5,0.0,164356.0,0.0,0.0,163840.0,65536.0,0.0,458752.0,0.5767256645623982,12.363999999999997,691892.925,1.0 +dct8x8,_Z14CUDAkernel1DCTPfiiiy,0.0,0.0,552.8,32121.9,786431.9999999999,114688.00000000001,32786.0,0.0,16383.999999999998,32767.999999999996,32767.999999999996,0.0,131071.99999999999,0.06091433507559575,7.7799999999999985,24207.632500000003,1.0 +dct8x8,_Z14CUDAkernel2DCTPfS_i,0.0,32768.00000000002,0.0,32768.00000000002,0.0,49152.00000000004,32773.25742574254,0.0,0.0,32768.00000000002,32768.00000000002,0.0,131072.0000000001,0.14345732731755537,30.750257425742568,5822.941584158416,1.0 +binomialOptions,_Z21binomialOptionsKernelv,0.0,0.0,0.0,1024.0,23688.0,16778240.0,640.0,0.0,0.0,0.0,1024.0,0.0,2048.0,0.6457304629145744,1.9519999999999982,1366301.225,1.0 
+fastWalshTransform,_Z15fwtBatch2KernelPfS_i,0.0,1048576.0000000002,774120.4444444445,271536.22222222225,0.0,0.0,1048581.888888889,945003.222222222,0.0,1048576.0000000002,1048576.0000000002,0.0,4194304.000000001,0.0867005928407203,2.574222222222223,120947.73472222223,1.0 +fastWalshTransform,_Z15fwtBatch1KernelPfS_i,0.0,1048576.0,645060.0,403890.6666666666,0.0,3407872.0,1048581.0,950303.3333333333,0.0,1048576.0,1048576.0,0.0,4194304.0,0.3836524328760675,2.621333333333329,149487.8,1.0 +histogram,_Z17histogram64KernelPjP5uint4j,0.0,2097152.0,0.0,34960.0,0.0,4893504.000000001,2097184.2941176468,26959.294117647052,0.0,2097152.0,34960.0,0.0,4264223.999999999,0.3361853461559831,3.706823529411762,146480.14411764703,1.0 +mergeSort,_Z21mergeSortSharedKernelILj1EEvPjS0_S0_S0_j,0.0,1048576.0,0.0,1048576.0,0.0,12976128.0,1048580.0,950169.0,0.0,1048576.0,1048576.0,0.0,4194304.0,0.9137102229423307,1.1600000000000055,439316.525,1.0 +mergeSort,_Z30mergeElementaryIntervalsKernelILj1EEvPjS0_S0_S0_S0_S0_jj,152481.75,1127706.3333333333,439852.24999999994,829969.9166666665,0.0,3670010.1666666665,1056772.0000000002,959704.0833333334,199523.16666666666,1056768.0,1269875.1666666667,0.0,4878632.833333334,0.44812863772322986,1.6420000000000003,157457.05,1.0 +quasirandomGenerator,_Z26quasirandomGeneratorKernelPfjj,0.0,0.0,0.0,393215.9999999999,47616.000000000015,0.0,21.0,294938.38095238095,0.0,0.0,393215.9999999999,0.0,786431.9999999998,0.6109600290450061,17.68266666666667,80626.8130952381,1.0 +quasirandomGenerator,_Z16inverseCNDKernelPfPjj,0.0,0.0,0.0,393215.9999999999,0.0,0.0,5.952380952380952,294941.6666666666,0.0,0.0,393215.9999999999,0.0,786431.9999999998,0.307434624439692,5.790476190476192,58367.4988095238,1.0 +sobolQRNG,_Z15sobolGPU_kerneljjPjPf,172832.0,31976.0,0.0,1250000.0,0.0,1899700.0,405.0,1151641.0,31592.0,400.0,1250000.0,0.0,2563936.0,0.6380044567750587,2.7840000000000042,112087.775,1.0 
+cutlass_perf_test_k1,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,460800.0,0.0,5120.0,160.0,577120.0000000001,412167.99999999994,42.285714285714285,48640.0,412160.0,5120.0,0.0,931840.0,0.24658369358809393,60.32228571428572,139808.59999999998,1.0 
+cutlass_perf_test_k2,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,2097151.9999999995,171796.0,65782.85714285714,255.99999999999994,1464319.9999999998,1081352.2857142857,45.42857142857143,1015808.0000000002,1081344.0,237568.0,0.0,4669440.0,0.38530040572560803,48.440000000000005,228263.9035714286,1.0 
+cutlass_perf_test_k3,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,3276800.0000000005,429682.85714285716,164204.57142857142,640.0,2309120.0,491527.9999999999,77869.28571428571,2785279.9999999995,491519.99999999994,593920.0000000001,0.0,7741440.0,0.8525726478636384,1.832,161781.07857142857,1.0 +cudaTensorCoreGemm,_Z12compute_gemmPK6__halfS1_PKfPfff,0.0,69206016.0,0.0,2097152.0,0.0,30146560.0,16974052.0,1998866.0,52232060.0,16973824.0,2097152.0,0.0,142606336.0,0.7380984268363922,1.264000000000003,3871172.375,1.0 diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters omitted]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
index 3fa51ee14..5c6be224a 100644
--- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -100,7 +100,7 @@ # Volta has four schedulers per core -gpgpu_num_sched_per_core 4 # Greedy then oldest scheduler --gpgpu_scheduler gto +-gpgpu_scheduler lrr ## In Volta, a warp scheduler can issue 1 inst per cycle -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 @@ -114,17 +114,21 @@ # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 -# Volta unified cache has four banks +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_l1_latency 20 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 4.5MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 @@ -196,9 +200,6 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta --power_simulation_enabled 0 - # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD diff --git a/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt b/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt new file mode 100644 index 000000000..6775d5d6f --- /dev/null +++ b/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 78; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config new file mode 100644 index 000000000..854378151 --- /dev/null +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config 
@@ -0,0 +1,179 @@ +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 86 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 8 +-gpgpu_compute_capability_minor 6 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 46 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 16 +-gpgpu_n_sub_partition_per_mchannel 2 + +# clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1132:1132:1132:3500.5 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 86 + +-gpgpu_shader_core_pipeline 1536:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 +-ptx_opcode_latency_fp 4,4,4,4,39 +-ptx_opcode_initiation_fp 1,1,1,1,2 +-ptx_opcode_latency_dp 64,64,64,64,330 +-ptx_opcode_initiation_dp 64,64,64,64,130 +-ptx_opcode_latency_sfu 21 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# sub core model: in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# register banks +-gpgpu_num_reg_banks 8 +-gpgpu_reg_file_port_throughput 2 + +# warp scheduling +-gpgpu_num_sched_per_core 4 +-gpgpu_scheduler lrr +# a warp scheduler issue mode +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mb>:**<fifo_entry> +# ** Optional parameter - Required when mshr_type==Texture Fifo +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32 +-gpgpu_l1_latency 39 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_cache_write_ratio 25 + +# shared memory configuration +-gpgpu_shmem_size 102400 +-gpgpu_shmem_sizeDefault 102400 +-gpgpu_shmem_per_block 49152 +-gpgpu_smem_latency 29 +# shared memory bank conflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 86 + +# L2 cache +-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note: TEX is deprecated since Volta; it is used for legacy apps only.
Use L1D cache instead with .nc modifier or __ldg method +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 187 +-dram_latency 254 + +# dram sched config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# dram model config +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 2 +-gpgpu_dram_burst_length 16 +-dram_data_command_freq_ratio 4 +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS + +# Mem timing +-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4 +-dram_dual_bus_interface 0 + +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# power model configs, disable it until we create a real energy model +-power_simulation_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + diff --git a/format-code.sh b/format-code.sh index fb1cc909a..ac753f059 100755 --- a/format-code.sh +++ b/format-code.sh @@ -8,8 +8,5 @@ clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc clang-format -i ${THIS_DIR}/src/cuda-sim/*.h clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc -clang-format -i ${THIS_DIR}/src/gpuwattch/*.h -clang-format -i ${THIS_DIR}/src/gpuwattch/*.cc -clang-format -i ${THIS_DIR}/src/trace-driven/*.h -clang-format -i ${THIS_DIR}/src/trace-driven/*.cc -clang-format -i ${THIS_DIR}/src/trace-driven/ISA_Def/*.h +clang-format -i ${THIS_DIR}/src/accelwattch/*.h +clang-format -i ${THIS_DIR}/src/accelwattch/*.cc \ No newline at end of file diff --git a/setup_environment b/setup_environment index 07d078844..d3ff8403c 100644 --- a/setup_environment +++ b/setup_environment @@ -117,18 +117,18 @@ fi # The following checks to see if the GPGPU-Sim power model is enabled. # GPGPUSIM_POWER_MODEL points to the directory where gpgpusim_mcpat is located. -# If this is not set, it checks the default directory "$GPGPUSIM_ROOT/src/gpuwattch/". -if [ -d $GPGPUSIM_ROOT/src/gpuwattch/ ]; then - if [ ! -f $GPGPUSIM_ROOT/src/gpuwattch/gpgpu_sim.verify ]; then - echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/gpuwattch"; +# If this is not set, it checks the default directory "$GPGPUSIM_ROOT/src/accelwattch/". +if [ -d $GPGPUSIM_ROOT/src/accelwattch/ ]; then + if [ ! -f $GPGPUSIM_ROOT/src/accelwattch/gpgpu_sim.verify ]; then + echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch"; return; fi - export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/gpuwattch/; - echo "configured with GPUWattch."; + export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/accelwattch/; + echo "configured with AccelWattch."; elif [ -n "$GPGPUSIM_POWER_MODEL" ]; then if [ !
-f $GPGPUSIM_POWER_MODEL/gpgpu_sim.verify ]; then echo ""; - echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/gpuwattch/ - Either incorrect directory or incorrect McPAT version"; + echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch/ - Either incorrect directory or incorrect McPAT version"; return; fi echo "configure with power model in $GPGPUSIM_POWER_MODEL."; diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 5ad6f105d..208047eeb 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
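The abstract_hardware_model changes that follow widen the address typedefs (address_type and addr_t become unsigned long long) and change line_size_based_tag_func and the local block_address variables from unsigned to new_addr_type, so cache-line tags are computed in 64 bits. The short, self-contained sketch below shows why that matters for addresses above 4 GiB; it reuses the same masking expression as the patch, but the function names and the sample address are illustrative only.

#include <cstdio>

typedef unsigned long long new_addr_type;  // matches the simulator's typedef

// Same expression as line_size_based_tag_func in the patch: keep the bits
// above the line offset.
static new_addr_type tag64(new_addr_type address, new_addr_type line_size) {
  return address & ~(line_size - 1);
}

// What a 32-bit "unsigned" intermediate would do instead: the high bits of
// the address are silently dropped before masking.
static unsigned tag32(new_addr_type address, unsigned line_size) {
  return static_cast<unsigned>(address) & ~(line_size - 1);
}

int main() {
  const new_addr_type addr = 0x123456789ULL;  // an address above 4 GiB
  std::printf("64-bit tag: 0x%llx\n", tag64(addr, 128));  // 0x123456780
  std::printf("32-bit tag: 0x%x\n", tag32(addr, 128));    // 0x23456780 (aliased)
  return 0;
}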
+ #include "abstract_hardware_model.h" #include #include @@ -205,8 +207,8 @@ gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx) gpu_tot_sim_cycle = 0; } -address_type line_size_based_tag_func(new_addr_type address, - new_addr_type line_size) { +new_addr_type line_size_based_tag_func(new_addr_type address, + new_addr_type line_size) { // gives the tag for an address based on a given line size return address & ~(line_size - 1); } @@ -281,14 +283,16 @@ void warp_inst_t::broadcast_barrier_reduction( void warp_inst_t::generate_mem_accesses() { if (empty() || op == MEMORY_BARRIER_OP || m_mem_accesses_created) return; if (!((op == LOAD_OP) || (op == TENSOR_CORE_LOAD_OP) || (op == STORE_OP) || - (op == TENSOR_CORE_STORE_OP))) + (op == TENSOR_CORE_STORE_OP) )) return; if (m_warp_active_mask.count() == 0) return; // predicated off const size_t starting_queue_size = m_accessq.size(); assert(is_load() || is_store()); - assert(m_per_scalar_thread_valid); // need address information per thread + + //if((space.get_type() != tex_space) && (space.get_type() != const_space)) + assert(m_per_scalar_thread_valid); // need address information per thread bool is_write = is_store(); @@ -448,7 +452,8 @@ void warp_inst_t::generate_mem_accesses() { for (unsigned thread = 0; thread < m_config->warp_size; thread++) { if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - unsigned block_address = line_size_based_tag_func(addr, cache_block_size); + new_addr_type block_address = + line_size_based_tag_func(addr, cache_block_size); accesses[block_address].set(thread); unsigned idx = addr - block_address; for (unsigned i = 0; i < data_size; i++) byte_mask.set(idx + i); @@ -530,7 +535,8 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, (m_per_scalar_thread[thread].memreqaddr[access] != 0); access++) { new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[access]; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte // chunk does this thread access? @@ -552,7 +558,8 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, if (block_address != line_size_based_tag_func( addr + data_size_coales - 1, segment_size)) { addr = addr + data_size_coales - 1; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; transaction_info &info = subwarp_transactions[block_address]; info.chunks.set(chunk); @@ -625,7 +632,8 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte chunk // does this thread access? diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 49f3e9f90..7ffc13940 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Inderpreet Singh, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. 
Aamodt, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -59,13 +60,37 @@ enum _memory_space_t { instruction_space }; +#ifndef COEFF_STRUCT +#define COEFF_STRUCT + +struct PowerscalingCoefficients{ + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; +}; +#endif + enum FuncCache { FuncCachePreferNone = 0, FuncCachePreferShared = 1, FuncCachePreferL1 = 2 }; -enum AdaptiveCache { FIXED = 0, ADAPTIVE_VOLTA = 1 }; +enum AdaptiveCache { FIXED = 0, ADAPTIVE_CACHE = 1 }; #ifdef __cplusplus @@ -75,8 +100,8 @@ enum AdaptiveCache { FIXED = 0, ADAPTIVE_VOLTA = 1 }; typedef unsigned long long new_addr_type; typedef unsigned long long cudaTextureObject_t; -typedef unsigned address_type; -typedef unsigned addr_t; +typedef unsigned long long address_type; +typedef unsigned long long addr_t; // the following are operations the timing model can see #define SPECIALIZED_UNIT_NUM 8 @@ -134,8 +159,14 @@ enum special_operations_t { FP_SQRT_OP, FP_LG_OP, FP_SIN_OP, - FP_EXP_OP + FP_EXP_OP, + DP_MUL_OP, + DP_DIV_OP, + DP___OP, + TENSOR__OP, + TEX__OP }; + typedef enum special_operations_t special_ops; // Required to identify for the power model enum operation_pipeline_t { @@ -373,6 +404,8 @@ class core_config { } unsigned mem_warp_parts; mutable unsigned gpgpu_shmem_size; + char *gpgpu_shmem_option; + std::vector shmem_opt_list; unsigned gpgpu_shmem_sizeDefault; unsigned gpgpu_shmem_sizePrefL1; unsigned gpgpu_shmem_sizePrefShared; @@ -734,7 +767,8 @@ typedef std::bitset mem_access_sector_mask_t; MA_TUP(GLOBAL_ACC_R), MA_TUP(LOCAL_ACC_R), MA_TUP(CONST_ACC_R), \ MA_TUP(TEXTURE_ACC_R), MA_TUP(GLOBAL_ACC_W), MA_TUP(LOCAL_ACC_W), \ MA_TUP(L1_WRBK_ACC), 
MA_TUP(L2_WRBK_ACC), MA_TUP(INST_ACC_R), \ - MA_TUP(L1_WR_ALLOC_R), MA_TUP(L2_WR_ALLOC_R), \ + MA_TUP(L1_WR_ALLOC_R), MA_TUP(L2_WR_ALLOC_R), MA_TUP(META_ACC), \ + MA_TUP(META_RBW), MA_TUP(META_WRBK_ACC), MA_TUP(META_WR_ALLOC_R),\ MA_TUP(NUM_MEM_ACCESS_TYPE) MA_TUP_END(mem_access_type) #define MA_TUP_BEGIN(X) enum X { @@ -869,6 +903,13 @@ class mem_fetch_allocator { virtual mem_fetch *alloc(const class warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const = 0; + virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, unsigned long long cycle, + unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const = 0; }; // the maximum number of destination, source, or address uarch operands in a @@ -902,6 +943,7 @@ class inst_t { sp_op = OTHER_OP; op_pipe = UNKOWN_OP; mem_op = NOT_TEX; + const_cache_operand = 0; num_operands = 0; num_regs = 0; memset(out, 0, sizeof(unsigned)); @@ -930,6 +972,20 @@ class inst_t { return (op == STORE_OP || op == TENSOR_CORE_STORE_OP || memory_op == memory_store); } + + bool is_fp() const { return ((sp_op == FP__OP));} //VIJAY + bool is_fpdiv() const { return ((sp_op == FP_DIV_OP));} + bool is_fpmul() const { return ((sp_op == FP_MUL_OP));} + bool is_dp() const { return ((sp_op == DP___OP));} + bool is_dpdiv() const { return ((sp_op == DP_DIV_OP));} + bool is_dpmul() const { return ((sp_op == DP_MUL_OP));} + bool is_imul() const { return ((sp_op == INT_MUL_OP));} + bool is_imul24() const { return ((sp_op == INT_MUL24_OP));} + bool is_imul32() const { return ((sp_op == INT_MUL32_OP));} + bool is_idiv() const { return ((sp_op == INT_DIV_OP));} + bool is_sfu() const {return ((sp_op == FP_SQRT_OP) || (sp_op == FP_LG_OP) || (sp_op == FP_SIN_OP) || (sp_op == FP_EXP_OP) || (sp_op == TENSOR__OP));} + bool is_alu() const {return (sp_op == INT__OP);} + unsigned get_num_operands() const { return num_operands; } unsigned get_num_regs() const { return num_regs; } void set_num_regs(unsigned num) { num_regs = num; } @@ -953,6 +1009,7 @@ class inst_t { operation_pipeline op_pipe; // code (uarch visible) identify the pipeline of // the operation (SP, SFU or MEM) mem_operation mem_op; // code (uarch visible) identify memory type + bool const_cache_operand; // has a load from constant memory as an operand _memory_op_t memory_op; // memory_op used by ptxplus unsigned num_operands; unsigned num_regs; // count vector operand as one register operand @@ -1291,6 +1348,7 @@ class register_set { } m_name = name; } + const char *get_name() { return m_name; } bool has_free() { for (unsigned i = 0; i < regs.size(); i++) { if (regs[i]->empty()) { @@ -1315,7 +1373,35 @@ class register_set { } return false; } + bool has_ready(bool sub_core_model, unsigned reg_id) { + if (!sub_core_model) return has_ready(); + assert(reg_id < regs.size()); + return (not regs[reg_id]->empty()); + } + unsigned get_ready_reg_id() { + // for sub core model we need to figure which reg_id has the ready warp + // this function should only be called if has_ready() was true + assert(has_ready()); + warp_inst_t **ready; + ready = NULL; + unsigned reg_id; + for (unsigned i = 0; i < regs.size(); i++) { + if (not regs[i]->empty()) { + if (ready and (*ready)->get_uid() < regs[i]->get_uid()) { + // ready is oldest + } else { + ready = ®s[i]; + reg_id = i; + } + } + } + return reg_id; + } + unsigned get_schd_id(unsigned reg_id) { + 
assert(not regs[reg_id]->empty()); + return regs[reg_id]->get_schd_id(); + } void move_in(warp_inst_t *&src) { warp_inst_t **free = get_free(); move_warp(*free, src); @@ -1323,10 +1409,29 @@ class register_set { // void copy_in( warp_inst_t* src ){ // src->copy_contents_to(*get_free()); //} + void move_in(bool sub_core_model, unsigned reg_id, warp_inst_t *&src) { + warp_inst_t **free; + if (!sub_core_model) { + free = get_free(); + } else { + assert(reg_id < regs.size()); + free = get_free(sub_core_model, reg_id); + } + move_warp(*free, src); + } + void move_out_to(warp_inst_t *&dest) { warp_inst_t **ready = get_ready(); move_warp(dest, *ready); } + void move_out_to(bool sub_core_model, unsigned reg_id, warp_inst_t *&dest) { + if (!sub_core_model) { + return move_out_to(dest); + } + warp_inst_t **ready = get_ready(sub_core_model, reg_id); + assert(ready != NULL); + move_warp(dest, *ready); + } warp_inst_t **get_ready() { warp_inst_t **ready; @@ -1342,6 +1447,14 @@ class register_set { } return ready; } + warp_inst_t **get_ready(bool sub_core_model, unsigned reg_id) { + if (!sub_core_model) return get_ready(); + warp_inst_t **ready; + ready = NULL; + assert(reg_id < regs.size()); + if (not regs[reg_id]->empty()) ready = ®s[reg_id]; + return ready; + } void print(FILE *fp) const { fprintf(fp, "%s : @%p\n", m_name, this); @@ -1382,6 +1495,8 @@ class register_set { const char *m_name; }; +typedef std::map counterMap; + #endif // #ifdef __cplusplus #endif // #ifndef ABSTRACT_HARDWARE_MODEL_INCLUDED diff --git a/src/gpuwattch/Alpha21364.xml b/src/accelwattch/Alpha21364.xml similarity index 100% rename from src/gpuwattch/Alpha21364.xml rename to src/accelwattch/Alpha21364.xml diff --git a/src/gpuwattch/Niagara1.xml b/src/accelwattch/Niagara1.xml similarity index 100% rename from src/gpuwattch/Niagara1.xml rename to src/accelwattch/Niagara1.xml diff --git a/src/gpuwattch/Niagara1_sharing.xml b/src/accelwattch/Niagara1_sharing.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing.xml rename to src/accelwattch/Niagara1_sharing.xml diff --git a/src/gpuwattch/Niagara1_sharing_DC.xml b/src/accelwattch/Niagara1_sharing_DC.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_DC.xml rename to src/accelwattch/Niagara1_sharing_DC.xml diff --git a/src/gpuwattch/Niagara1_sharing_SBT.xml b/src/accelwattch/Niagara1_sharing_SBT.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_SBT.xml rename to src/accelwattch/Niagara1_sharing_SBT.xml diff --git a/src/gpuwattch/Niagara1_sharing_ST.xml b/src/accelwattch/Niagara1_sharing_ST.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_ST.xml rename to src/accelwattch/Niagara1_sharing_ST.xml diff --git a/src/gpuwattch/Niagara2.xml b/src/accelwattch/Niagara2.xml similarity index 100% rename from src/gpuwattch/Niagara2.xml rename to src/accelwattch/Niagara2.xml diff --git a/src/gpuwattch/Penryn.xml b/src/accelwattch/Penryn.xml similarity index 100% rename from src/gpuwattch/Penryn.xml rename to src/accelwattch/Penryn.xml diff --git a/src/gpuwattch/README b/src/accelwattch/README similarity index 100% rename from src/gpuwattch/README rename to src/accelwattch/README diff --git a/src/gpuwattch/XML_Parse.cc b/src/accelwattch/XML_Parse.cc similarity index 92% rename from src/gpuwattch/XML_Parse.cc rename to src/accelwattch/XML_Parse.cc index 1b9a38ae1..eaec74806 100644 --- a/src/gpuwattch/XML_Parse.cc +++ b/src/accelwattch/XML_Parse.cc @@ -30,12 +30,14 @@ 
***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ + #include "XML_Parse.h" #include #include @@ -43,13 +45,14 @@ using namespace std; -const char* perf_count_label[] = { - "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", "DC_RH,", - "DC_RM,", "DC_WH,", "DC_WM,", "TC_H,", "TC_M,", - "CC_H,", "CC_M,", "SHRD_ACC,", "REG_RD,", "REG_WR,", - "NON_REG_OPs,", "SP_ACC,", "SFU_ACC,", "FPU_ACC,", "MEM_RD,", - "MEM_WR,", "MEM_PRE,", "L2_RH,", "L2_RM,", "L2_WH,", - "L2_WM,", "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "CONST_DYNAMICN"}; +const char * perf_count_label[] = { + "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", "DC_RH,", "DC_RM,", "DC_WH,", "DC_WM,", + "TC_H,", "TC_M,", "CC_H,", "CC_M,", "SHRD_ACC,", "REG_RD,", "REG_WR,", "NON_REG_OPs,", + "INT_ACC,", "FPU_ACC,", "DPU_ACC,", "INT_MUL24_ACC,", "INT_MUL32_ACC,", "INT_MUL_ACC,","INT_DIV_ACC,", + "FP_MUL_ACC,", "FP_DIV_ACC,", "FP_SQRT_ACC,", "FP_LG_ACC,", "FP_SIN_ACC,", "FP_EXP_ACC,", "DP_MUL_ACC,", + "DP_DIV_ACC,", "TENSOR_ACC,", "TEX_ACC,", "MEM_RD,","MEM_WR,", "MEM_PRE,", "L2_RH,", "L2_RM,", "L2_WH,", + "L2_WM,", "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "constant_power"}; + void ParseXML::parse(char* filepath) { unsigned int i, j, k, m, n; @@ -160,6 +163,199 @@ void ParseXML::parse(char* filepath) { atoi(xNode2.getChildNode("param", i).getAttribute("value")); continue; } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "modeled_chip_voltage_ref") == 0) { + sys.modeled_chip_voltage_ref = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat1_flane") == 0) { + sys.static_cat1_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat2_flane") == 0) { + sys.static_cat2_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat3_flane") == 0) { + sys.static_cat3_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat4_flane") == 0) { + sys.static_cat4_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat5_flane") == 0) { + sys.static_cat5_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat6_flane") == 0) { + sys.static_cat6_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_shared_flane") == 0) { + sys.static_shared_flane = + 
atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l1_flane") == 0) { + sys.static_l1_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l2_flane") == 0) { + sys.static_l2_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_light_flane") == 0) { + sys.static_light_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intadd_flane") == 0) { + sys.static_intadd_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intmul_flane") == 0) { + sys.static_intmul_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_geomean_flane") == 0) { + sys.static_geomean_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat1_addlane") == 0) { + sys.static_cat1_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat2_addlane") == 0) { + sys.static_cat2_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat3_addlane") == 0) { + sys.static_cat3_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat4_addlane") == 0) { + sys.static_cat4_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat5_addlane") == 0) { + sys.static_cat5_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat6_addlane") == 0) { + sys.static_cat6_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_shared_addlane") == 0) { + sys.static_shared_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l1_addlane") == 0) { + sys.static_l1_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l2_addlane") == 0) { + sys.static_l2_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_light_addlane") == 0) { + sys.static_light_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intadd_addlane") == 0) { + sys.static_intadd_addlane = + 
atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intmul_addlane") == 0) { + sys.static_intmul_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_geomean_addlane") == 0) { + sys.static_geomean_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "target_chip_area") == 0) { sys.target_chip_area = @@ -419,22 +615,106 @@ void ParseXML::parse(char* filepath) { atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "SP_ACC") == 0) { - sys.scaling_coefficients[SP_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_ACC")==0) { + sys.scaling_coefficients[INT_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "SFU_ACC") == 0) { - sys.scaling_coefficients[SFU_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_ACC")==0) { + sys.scaling_coefficients[FP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "FPU_ACC") == 0) { - sys.scaling_coefficients[FPU_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_ACC")==0) { + sys.scaling_coefficients[DP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL24_ACC")==0) { + sys.scaling_coefficients[INT_MUL24_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL32_ACC")==0) { + sys.scaling_coefficients[INT_MUL32_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL_ACC")==0) { + sys.scaling_coefficients[INT_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_DIV_ACC")==0) { + sys.scaling_coefficients[INT_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_MUL_ACC")==0) { + sys.scaling_coefficients[FP_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_DIV_ACC")==0) { + sys.scaling_coefficients[FP_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_SQRT_ACC")==0) { + sys.scaling_coefficients[FP_SQRT_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_LG_ACC")==0) { + sys.scaling_coefficients[FP_LG_ACC] = + 
atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_SIN_ACC")==0) { + sys.scaling_coefficients[FP_SIN_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_EXP_ACC")==0) { + sys.scaling_coefficients[FP_EXP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_MUL_ACC")==0) { + sys.scaling_coefficients[DP_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_DIV_ACC")==0) { + sys.scaling_coefficients[DP_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "TENSOR_ACC")==0) { + sys.scaling_coefficients[TENSOR_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "TEX_ACC")==0) { + sys.scaling_coefficients[TEX_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), @@ -498,8 +778,8 @@ void ParseXML::parse(char* filepath) { continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "CONST_DYNAMICN") == 0) { - sys.scaling_coefficients[CONST_DYNAMICN] = + "constant_power") == 0) { + sys.scaling_coefficients[constant_power] = atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } @@ -4187,8 +4467,9 @@ void ParseXML::initialize() // Initialize all // strcpy(sys.homogeneous_cores,"default"); sys.core_tech_node = 1; sys.target_core_clockrate = 1; + sys.modeled_chip_voltage_ref = 1; sys.target_chip_area = 1; - sys.temperature = 1; + sys.temperature = 340; sys.number_cache_levels = 1; sys.homogeneous_cores = 1; sys.homogeneous_L1Directories = 1; @@ -4198,6 +4479,34 @@ void ParseXML::initialize() // Initialize all sys.homogeneous_NoCs = 1; sys.homogeneous_ccs = 1; + sys.static_cat1_flane = 0; + sys.static_cat2_flane = 0; + sys.static_cat3_flane = 0; + sys.static_cat4_flane = 0; + sys.static_cat5_flane = 0; + sys.static_cat6_flane = 0; + sys.static_shared_flane = 0; + sys.static_l1_flane = 0; + sys.static_l2_flane = 0; + sys.static_light_flane = 0; + sys.static_intadd_flane = 0; + sys.static_intmul_flane = 0; + sys.static_geomean_flane = 0; + + sys.static_cat1_addlane = 0; + sys.static_cat2_addlane = 0; + sys.static_cat3_addlane = 0; + sys.static_cat4_addlane = 0; + sys.static_cat5_addlane = 0; + sys.static_cat6_addlane = 0; + sys.static_shared_addlane = 0; + sys.static_l1_addlane = 0; + sys.static_l2_addlane = 0; + sys.static_light_addlane = 0; + sys.static_intadd_addlane = 0; + sys.static_intmul_addlane = 0; + sys.static_geomean_addlane = 0; + sys.Max_area_deviation = 1; sys.Max_power_deviation = 1; sys.device_type = 1; diff --git a/src/gpuwattch/XML_Parse.h b/src/accelwattch/XML_Parse.h similarity index 89% rename from src/gpuwattch/XML_Parse.h rename to src/accelwattch/XML_Parse.h index 30c4e4b13..c82359faf 100644 --- a/src/gpuwattch/XML_Parse.h +++ b/src/accelwattch/XML_Parse.h @@ -30,10 +30,11 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen 
Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ #ifndef XML_PARSE_H_ @@ -69,7 +70,7 @@ ToXMLStringTool tx,tx2; extern const char* perf_count_label[]; enum perf_count_t { - TOT_INST = 0, + TOT_INST=0, FP_INT, IC_H, IC_M, @@ -85,9 +86,23 @@ enum perf_count_t { REG_RD, REG_WR, NON_REG_OPs, - SP_ACC, - SFU_ACC, - FPU_ACC, + INT_ACC, //SPU + FP_ACC, //FPU + DP_ACC, //FPU + INT_MUL24_ACC, //SFU + INT_MUL32_ACC, //SFU + INT_MUL_ACC, //SFU + INT_DIV_ACC, //SFU + FP_MUL_ACC, //SFU + FP_DIV_ACC, //SFU + FP_SQRT_ACC, //SFU + FP_LG_ACC, //SFU + FP_SIN_ACC, //SFU + FP_EXP_ACC, //SFU + DP_MUL_ACC, //SFU + DP_DIV_ACC, //SFU + TENSOR_ACC, //SFU + TEX_ACC, //SFU MEM_RD, MEM_WR, MEM_PRE, @@ -98,7 +113,7 @@ enum perf_count_t { NOC_A, PIPE_A, IDLE_CORE_N, - CONST_DYNAMICN, + constant_power, NUM_PERFORMANCE_COUNTERS }; @@ -635,6 +650,33 @@ typedef struct { int homogeneous_L2Directories; double core_tech_node; int target_core_clockrate; + double modeled_chip_voltage_ref; + double static_cat1_flane; + double static_cat2_flane; + double static_cat3_flane; + double static_cat4_flane; + double static_cat5_flane; + double static_cat6_flane; + double static_shared_flane; + double static_l1_flane; + double static_l2_flane; + double static_light_flane; + double static_intadd_flane; + double static_intmul_flane; + double static_geomean_flane; + double static_cat1_addlane; + double static_cat2_addlane; + double static_cat3_addlane; + double static_cat4_addlane; + double static_cat5_addlane; + double static_cat6_addlane; + double static_shared_addlane; + double static_l1_addlane; + double static_l2_addlane; + double static_light_addlane; + double static_intadd_addlane; + double static_intmul_addlane; + double static_geomean_addlane; int target_chip_area; int temperature; int number_cache_levels; diff --git a/src/gpuwattch/Xeon.xml b/src/accelwattch/Xeon.xml similarity index 100% rename from src/gpuwattch/Xeon.xml rename to src/accelwattch/Xeon.xml diff --git a/src/gpuwattch/arch_const.h b/src/accelwattch/arch_const.h similarity index 100% rename from src/gpuwattch/arch_const.h rename to src/accelwattch/arch_const.h diff --git a/src/gpuwattch/array.cc b/src/accelwattch/array.cc similarity index 100% rename from src/gpuwattch/array.cc rename to src/accelwattch/array.cc diff --git a/src/gpuwattch/array.h b/src/accelwattch/array.h similarity index 100% rename from src/gpuwattch/array.h rename to src/accelwattch/array.h diff --git a/src/gpuwattch/basic_components.cc b/src/accelwattch/basic_components.cc similarity index 100% rename from src/gpuwattch/basic_components.cc rename to src/accelwattch/basic_components.cc diff --git a/src/gpuwattch/basic_components.h b/src/accelwattch/basic_components.h similarity index 100% rename from src/gpuwattch/basic_components.h rename to src/accelwattch/basic_components.h diff --git a/src/gpuwattch/cacti/README b/src/accelwattch/cacti/README similarity index 100% rename from src/gpuwattch/cacti/README rename to src/accelwattch/cacti/README diff --git a/src/gpuwattch/cacti/Ucache.cc b/src/accelwattch/cacti/Ucache.cc similarity index 99% 
rename from src/gpuwattch/cacti/Ucache.cc rename to src/accelwattch/cacti/Ucache.cc index 8f733f73b..e92e67b91 100644 --- a/src/gpuwattch/cacti/Ucache.cc +++ b/src/accelwattch/cacti/Ucache.cc @@ -223,7 +223,7 @@ void * calc_time_mt_wrapper(void * void_obj) delete tag_arr.back(); data_arr.pop_back(); tag_arr.pop_back(); - + pthread_exit(NULL); } @@ -246,7 +246,7 @@ bool calculate_time( { DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem); - if (dyn_p.is_valid == false) + if (dyn_p.is_valid != true) { return false; } diff --git a/src/gpuwattch/cacti/Ucache.h b/src/accelwattch/cacti/Ucache.h similarity index 100% rename from src/gpuwattch/cacti/Ucache.h rename to src/accelwattch/cacti/Ucache.h diff --git a/src/gpuwattch/cacti/arbiter.cc b/src/accelwattch/cacti/arbiter.cc similarity index 100% rename from src/gpuwattch/cacti/arbiter.cc rename to src/accelwattch/cacti/arbiter.cc diff --git a/src/gpuwattch/cacti/arbiter.h b/src/accelwattch/cacti/arbiter.h similarity index 100% rename from src/gpuwattch/cacti/arbiter.h rename to src/accelwattch/cacti/arbiter.h diff --git a/src/gpuwattch/cacti/area.cc b/src/accelwattch/cacti/area.cc similarity index 100% rename from src/gpuwattch/cacti/area.cc rename to src/accelwattch/cacti/area.cc diff --git a/src/gpuwattch/cacti/area.h b/src/accelwattch/cacti/area.h similarity index 100% rename from src/gpuwattch/cacti/area.h rename to src/accelwattch/cacti/area.h diff --git a/src/gpuwattch/cacti/bank.cc b/src/accelwattch/cacti/bank.cc similarity index 100% rename from src/gpuwattch/cacti/bank.cc rename to src/accelwattch/cacti/bank.cc diff --git a/src/gpuwattch/cacti/bank.h b/src/accelwattch/cacti/bank.h similarity index 100% rename from src/gpuwattch/cacti/bank.h rename to src/accelwattch/cacti/bank.h diff --git a/src/gpuwattch/cacti/basic_circuit.cc b/src/accelwattch/cacti/basic_circuit.cc similarity index 100% rename from src/gpuwattch/cacti/basic_circuit.cc rename to src/accelwattch/cacti/basic_circuit.cc diff --git a/src/gpuwattch/cacti/basic_circuit.h b/src/accelwattch/cacti/basic_circuit.h similarity index 100% rename from src/gpuwattch/cacti/basic_circuit.h rename to src/accelwattch/cacti/basic_circuit.h diff --git a/src/gpuwattch/cacti/batch_tests b/src/accelwattch/cacti/batch_tests similarity index 100% rename from src/gpuwattch/cacti/batch_tests rename to src/accelwattch/cacti/batch_tests diff --git a/src/gpuwattch/cacti/cache.cfg b/src/accelwattch/cacti/cache.cfg similarity index 100% rename from src/gpuwattch/cacti/cache.cfg rename to src/accelwattch/cacti/cache.cfg diff --git a/src/gpuwattch/cacti/cacti.i b/src/accelwattch/cacti/cacti.i similarity index 100% rename from src/gpuwattch/cacti/cacti.i rename to src/accelwattch/cacti/cacti.i diff --git a/src/gpuwattch/cacti/cacti.mk b/src/accelwattch/cacti/cacti.mk similarity index 96% rename from src/gpuwattch/cacti/cacti.mk rename to src/accelwattch/cacti/cacti.mk index 7f3c57338..41f9218f4 100644 --- a/src/gpuwattch/cacti/cacti.mk +++ b/src/accelwattch/cacti/cacti.mk @@ -1,5 +1,5 @@ -OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch/cacti +OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/accelwattch/cacti TARGET = cacti SHELL = /bin/sh .PHONY: all depend clean diff --git a/src/gpuwattch/cacti/cacti_interface.cc b/src/accelwattch/cacti/cacti_interface.cc similarity index 100% rename from src/gpuwattch/cacti/cacti_interface.cc rename to src/accelwattch/cacti/cacti_interface.cc diff --git a/src/gpuwattch/cacti/cacti_interface.h 
b/src/accelwattch/cacti/cacti_interface.h similarity index 100% rename from src/gpuwattch/cacti/cacti_interface.h rename to src/accelwattch/cacti/cacti_interface.h diff --git a/src/gpuwattch/cacti/component.cc b/src/accelwattch/cacti/component.cc similarity index 100% rename from src/gpuwattch/cacti/component.cc rename to src/accelwattch/cacti/component.cc diff --git a/src/gpuwattch/cacti/component.h b/src/accelwattch/cacti/component.h similarity index 100% rename from src/gpuwattch/cacti/component.h rename to src/accelwattch/cacti/component.h diff --git a/src/gpuwattch/cacti/const.h b/src/accelwattch/cacti/const.h similarity index 100% rename from src/gpuwattch/cacti/const.h rename to src/accelwattch/cacti/const.h diff --git a/src/gpuwattch/cacti/contention.dat b/src/accelwattch/cacti/contention.dat similarity index 100% rename from src/gpuwattch/cacti/contention.dat rename to src/accelwattch/cacti/contention.dat diff --git a/src/gpuwattch/cacti/crossbar.cc b/src/accelwattch/cacti/crossbar.cc similarity index 100% rename from src/gpuwattch/cacti/crossbar.cc rename to src/accelwattch/cacti/crossbar.cc diff --git a/src/gpuwattch/cacti/crossbar.h b/src/accelwattch/cacti/crossbar.h similarity index 100% rename from src/gpuwattch/cacti/crossbar.h rename to src/accelwattch/cacti/crossbar.h diff --git a/src/gpuwattch/cacti/decoder.cc b/src/accelwattch/cacti/decoder.cc similarity index 100% rename from src/gpuwattch/cacti/decoder.cc rename to src/accelwattch/cacti/decoder.cc diff --git a/src/gpuwattch/cacti/decoder.h b/src/accelwattch/cacti/decoder.h similarity index 100% rename from src/gpuwattch/cacti/decoder.h rename to src/accelwattch/cacti/decoder.h diff --git a/src/gpuwattch/cacti/highradix.cc b/src/accelwattch/cacti/highradix.cc similarity index 100% rename from src/gpuwattch/cacti/highradix.cc rename to src/accelwattch/cacti/highradix.cc diff --git a/src/gpuwattch/cacti/highradix.h b/src/accelwattch/cacti/highradix.h similarity index 100% rename from src/gpuwattch/cacti/highradix.h rename to src/accelwattch/cacti/highradix.h diff --git a/src/gpuwattch/cacti/htree2.cc b/src/accelwattch/cacti/htree2.cc similarity index 100% rename from src/gpuwattch/cacti/htree2.cc rename to src/accelwattch/cacti/htree2.cc diff --git a/src/gpuwattch/cacti/htree2.h b/src/accelwattch/cacti/htree2.h similarity index 100% rename from src/gpuwattch/cacti/htree2.h rename to src/accelwattch/cacti/htree2.h diff --git a/src/gpuwattch/cacti/io.cc b/src/accelwattch/cacti/io.cc similarity index 100% rename from src/gpuwattch/cacti/io.cc rename to src/accelwattch/cacti/io.cc diff --git a/src/gpuwattch/cacti/io.h b/src/accelwattch/cacti/io.h similarity index 100% rename from src/gpuwattch/cacti/io.h rename to src/accelwattch/cacti/io.h diff --git a/src/gpuwattch/cacti/main.cc b/src/accelwattch/cacti/main.cc similarity index 100% rename from src/gpuwattch/cacti/main.cc rename to src/accelwattch/cacti/main.cc diff --git a/src/gpuwattch/cacti/makefile b/src/accelwattch/cacti/makefile similarity index 100% rename from src/gpuwattch/cacti/makefile rename to src/accelwattch/cacti/makefile diff --git a/src/gpuwattch/cacti/mat.cc b/src/accelwattch/cacti/mat.cc similarity index 100% rename from src/gpuwattch/cacti/mat.cc rename to src/accelwattch/cacti/mat.cc diff --git a/src/gpuwattch/cacti/mat.h b/src/accelwattch/cacti/mat.h similarity index 100% rename from src/gpuwattch/cacti/mat.h rename to src/accelwattch/cacti/mat.h diff --git a/src/gpuwattch/cacti/nuca.cc b/src/accelwattch/cacti/nuca.cc similarity index 100% rename from 
src/gpuwattch/cacti/nuca.cc rename to src/accelwattch/cacti/nuca.cc diff --git a/src/gpuwattch/cacti/nuca.h b/src/accelwattch/cacti/nuca.h similarity index 100% rename from src/gpuwattch/cacti/nuca.h rename to src/accelwattch/cacti/nuca.h diff --git a/src/gpuwattch/cacti/out_batch_test_result.csv b/src/accelwattch/cacti/out_batch_test_result.csv similarity index 100% rename from src/gpuwattch/cacti/out_batch_test_result.csv rename to src/accelwattch/cacti/out_batch_test_result.csv diff --git a/src/gpuwattch/cacti/parameter.cc b/src/accelwattch/cacti/parameter.cc similarity index 100% rename from src/gpuwattch/cacti/parameter.cc rename to src/accelwattch/cacti/parameter.cc diff --git a/src/gpuwattch/cacti/parameter.h b/src/accelwattch/cacti/parameter.h similarity index 100% rename from src/gpuwattch/cacti/parameter.h rename to src/accelwattch/cacti/parameter.h diff --git a/src/gpuwattch/cacti/router.cc b/src/accelwattch/cacti/router.cc similarity index 100% rename from src/gpuwattch/cacti/router.cc rename to src/accelwattch/cacti/router.cc diff --git a/src/gpuwattch/cacti/router.h b/src/accelwattch/cacti/router.h similarity index 100% rename from src/gpuwattch/cacti/router.h rename to src/accelwattch/cacti/router.h diff --git a/src/gpuwattch/cacti/subarray.cc b/src/accelwattch/cacti/subarray.cc similarity index 100% rename from src/gpuwattch/cacti/subarray.cc rename to src/accelwattch/cacti/subarray.cc diff --git a/src/gpuwattch/cacti/subarray.h b/src/accelwattch/cacti/subarray.h similarity index 100% rename from src/gpuwattch/cacti/subarray.h rename to src/accelwattch/cacti/subarray.h diff --git a/src/gpuwattch/cacti/technology.cc b/src/accelwattch/cacti/technology.cc similarity index 100% rename from src/gpuwattch/cacti/technology.cc rename to src/accelwattch/cacti/technology.cc diff --git a/src/gpuwattch/cacti/uca.cc b/src/accelwattch/cacti/uca.cc similarity index 100% rename from src/gpuwattch/cacti/uca.cc rename to src/accelwattch/cacti/uca.cc diff --git a/src/gpuwattch/cacti/uca.h b/src/accelwattch/cacti/uca.h similarity index 100% rename from src/gpuwattch/cacti/uca.h rename to src/accelwattch/cacti/uca.h diff --git a/src/gpuwattch/cacti/wire.cc b/src/accelwattch/cacti/wire.cc similarity index 100% rename from src/gpuwattch/cacti/wire.cc rename to src/accelwattch/cacti/wire.cc diff --git a/src/gpuwattch/cacti/wire.h b/src/accelwattch/cacti/wire.h similarity index 100% rename from src/gpuwattch/cacti/wire.h rename to src/accelwattch/cacti/wire.h diff --git a/src/gpuwattch/core.cc b/src/accelwattch/core.cc similarity index 100% rename from src/gpuwattch/core.cc rename to src/accelwattch/core.cc diff --git a/src/gpuwattch/core.h b/src/accelwattch/core.h similarity index 100% rename from src/gpuwattch/core.h rename to src/accelwattch/core.h diff --git a/src/gpuwattch/fermi.xml b/src/accelwattch/fermi.xml similarity index 100% rename from src/gpuwattch/fermi.xml rename to src/accelwattch/fermi.xml diff --git a/src/gpuwattch/globalvar.h b/src/accelwattch/globalvar.h similarity index 100% rename from src/gpuwattch/globalvar.h rename to src/accelwattch/globalvar.h diff --git a/src/gpuwattch/gpgpu.xml b/src/accelwattch/gpgpu.xml similarity index 100% rename from src/gpuwattch/gpgpu.xml rename to src/accelwattch/gpgpu.xml diff --git a/src/gpuwattch/gpgpu_sim.verify b/src/accelwattch/gpgpu_sim.verify similarity index 100% rename from src/gpuwattch/gpgpu_sim.verify rename to src/accelwattch/gpgpu_sim.verify diff --git a/src/accelwattch/gpgpu_sim_wrapper.cc b/src/accelwattch/gpgpu_sim_wrapper.cc 
new file mode 100644 index 000000000..67d9daa1f --- /dev/null +++ b/src/accelwattch/gpgpu_sim_wrapper.cc @@ -0,0 +1,1143 @@ +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
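The new wrapper below keeps pwr_cmp_label[] in lockstep with the pwr_cmp_t enum (which ends in NUM_COMPONENTS_MODELLED) and, in init_mcpat when power tracing is enabled, prints the labels one after another as a CSV header for the gzip-compressed power trace. Here is a minimal sketch of that pattern with hypothetical names, plain stdio instead of gzprintf, and a static_assert (not present in the source) as one way to catch the label array and the enum drifting apart when a new component is added.

#include <cstdio>

// Hypothetical, shortened component list; the real enum has one entry per
// modelled power component.
enum power_component { COMP_IBP = 0, COMP_ICP, COMP_DCP, N_POWER_COMPONENTS };

// One label per enum entry, already carrying the trailing comma used when
// the CSV header row is emitted.
static const char *power_component_label[] = {"IBP,", "ICP,", "DCP,"};

static_assert(sizeof(power_component_label) / sizeof(power_component_label[0]) ==
                  N_POWER_COMPONENTS,
              "power component labels and enum are out of sync");

int main() {
  std::printf("power,");
  for (unsigned i = 0; i < N_POWER_COMPONENTS; ++i)
    std::printf("%s", power_component_label[i]);
  std::printf("\n");
  return 0;
}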
+ +#include "gpgpu_sim_wrapper.h" +#include +#define SP_BASE_POWER 0 +#define SFU_BASE_POWER 0 + +static const char* pwr_cmp_label[] = { + "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", "SHRDP,", "RFP,", "INTP,", + "FPUP,", "DPUP,", "INT_MUL24P,", "INT_MUL32P,", "INT_MULP,", "INT_DIVP,", + "FP_MULP,", "FP_DIVP,", "FP_SQRTP,", "FP_LGP,", "FP_SINP,", "FP_EXP,", + "DP_MULP,", "DP_DIVP,", "TENSORP,", "TEXP,", "SCHEDP,", "L2CP,", "MCP,", "NOCP,", + "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONSTP", "STATICP"}; + +enum pwr_cmp_t { + IBP=0, + ICP, + DCP, + TCP, + CCP, + SHRDP, + RFP, + INTP, + FPUP, + DPUP, + INT_MUL24P, + INT_MUL32P, + INT_MULP, + INT_DIVP, + FP_MULP, + FP_DIVP, + FP_SQRTP, + FP_LGP, + FP_SINP, + FP_EXP, + DP_MULP, + DP_DIVP, + TENSORP, + TEXP, + SCHEDP, + L2CP, + MCP, + NOCP, + DRAMP, + PIPEP, + IDLE_COREP, + CONSTP, + STATICP, + NUM_COMPONENTS_MODELLED +}; + +gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, + char* xmlfile, int power_simulation_mode, bool dvfs_enabled) { + kernel_sample_count = 0; + total_sample_count = 0; + + kernel_tot_power = 0; + avg_threads_per_warp_tot = 0; + num_pwr_cmps = NUM_COMPONENTS_MODELLED; + num_perf_counters = NUM_PERFORMANCE_COUNTERS; + + // Initialize per-component counter/power vectors + avg_max_min_counters init; + kernel_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, init); + kernel_cmp_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, init); + + kernel_power = init; // Per-kernel powers + gpu_tot_power = init; // Global powers + + sample_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, 0); + + sample_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, 0); + initpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); + effpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); + + const_dynamic_power = 0; + proc_power = 0; + + g_power_filename = NULL; + g_power_trace_filename = NULL; + g_metric_trace_filename = NULL; + g_steady_state_tracking_filename = NULL; + xml_filename = xmlfile; + g_power_simulation_enabled = power_simulation_enabled; + g_power_simulation_mode = power_simulation_mode; + g_dvfs_enabled = dvfs_enabled; + g_power_trace_enabled = false; + g_steady_power_levels_enabled = false; + g_power_trace_zlevel = 0; + g_power_per_cycle_dump = false; + gpu_steady_power_deviation = 0; + gpu_steady_min_period = 0; + + gpu_stat_sample_freq = 0; + p = new ParseXML(); + if (g_power_simulation_enabled) { + p->parse(xml_filename); + } + proc = new Processor(p); + power_trace_file = NULL; + metric_trace_file = NULL; + steady_state_tacking_file = NULL; + has_written_avg = false; + init_inst_val = false; +} + +gpgpu_sim_wrapper::~gpgpu_sim_wrapper() {} + +bool gpgpu_sim_wrapper::sanity_check(double a, double b) { + if (b == 0) + return (abs(a - b) < 0.00001); + else + return (abs(a - b) / abs(b) < 0.00001); + + return false; +} +void gpgpu_sim_wrapper::init_mcpat_hw_mode(unsigned gpu_sim_cycle) { + p->sys.total_cycles = gpu_sim_cycle; //total simulated cycles for current kernel +} + +void gpgpu_sim_wrapper::init_mcpat( + char* xmlfile, char* powerfilename, char* power_trace_filename, + char* metric_trace_filename, char* steady_state_filename, + bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, + bool power_per_cycle_dump, double steady_power_deviation, + double steady_min_period, int zlevel, double init_val, + int stat_sample_freq, int power_sim_mode, bool dvfs_enabled, + unsigned clock_freq, unsigned num_shaders) { + // Write File Headers for (-metrics trace, -power trace) + + reset_counters(); + static bool mcpat_init = true; + + // initialize file 
name if it is not set + time_t curr_time; + time(&curr_time); + char* date = ctime(&curr_time); + char* s = date; + while (*s) { + if (*s == ' ' || *s == '\t' || *s == ':') *s = '-'; + if (*s == '\n' || *s == '\r') *s = 0; + s++; + } + + if (mcpat_init) { + g_power_filename = powerfilename; + g_power_trace_filename = power_trace_filename; + g_metric_trace_filename = metric_trace_filename; + g_steady_state_tracking_filename = steady_state_filename; + xml_filename = xmlfile; + g_power_simulation_enabled = power_sim_enabled; + g_power_simulation_mode = power_sim_mode; + g_dvfs_enabled = dvfs_enabled; + g_power_trace_enabled = trace_enabled; + g_steady_power_levels_enabled = steady_state_enabled; + g_power_trace_zlevel = zlevel; + g_power_per_cycle_dump = power_per_cycle_dump; + gpu_steady_power_deviation = steady_power_deviation; + gpu_steady_min_period = steady_min_period; + + gpu_stat_sample_freq = stat_sample_freq; + + // p->sys.total_cycles=gpu_stat_sample_freq*4; + p->sys.total_cycles = gpu_stat_sample_freq; + p->sys.target_core_clockrate = clock_freq; + p->sys.number_of_cores = num_shaders; + p->sys.core[0].clock_rate = clock_freq; + power_trace_file = NULL; + metric_trace_file = NULL; + steady_state_tacking_file = NULL; + + if (g_power_trace_enabled) { + power_trace_file = gzopen(g_power_trace_filename, "w"); + metric_trace_file = gzopen(g_metric_trace_filename, "w"); + if ((power_trace_file == NULL) || (metric_trace_file == NULL)) { + printf("error - could not open trace files \n"); + exit(1); + } + gzsetparams(power_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); + + gzprintf(power_trace_file, "power,"); + for (unsigned i = 0; i < num_pwr_cmps; i++) { + gzprintf(power_trace_file, pwr_cmp_label[i]); + } + gzprintf(power_trace_file, "\n"); + + gzsetparams(metric_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); + for (unsigned i = 0; i < num_perf_counters; i++) { + gzprintf(metric_trace_file, perf_count_label[i]); + } + gzprintf(metric_trace_file, "\n"); + + gzclose(power_trace_file); + gzclose(metric_trace_file); + } + if (g_steady_power_levels_enabled) { + steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "w"); + if ((steady_state_tacking_file == NULL)) { + printf("error - could not open trace files \n"); + exit(1); + } + gzsetparams(steady_state_tacking_file, g_power_trace_zlevel, + Z_DEFAULT_STRATEGY); + gzprintf(steady_state_tacking_file, "start,end,power,IPC,"); + for (unsigned i = 0; i < num_perf_counters; i++) { + gzprintf(steady_state_tacking_file, perf_count_label[i]); + } + gzprintf(steady_state_tacking_file, "\n"); + + gzclose(steady_state_tacking_file); + } + + mcpat_init = false; + has_written_avg = false; + powerfile.open(g_power_filename); + int flg = chmod(g_power_filename, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + assert(flg == 0); + } + sample_val = 0; + init_inst_val = init_val; // gpu_tot_sim_insn+gpu_sim_insn; +} + +void gpgpu_sim_wrapper::reset_counters() { + avg_max_min_counters init; + for (unsigned i = 0; i < num_perf_counters; ++i) { + sample_perf_counters[i] = 0; + kernel_cmp_perf_counters[i] = init; + } + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + sample_cmp_pwr[i] = 0; + kernel_cmp_pwr[i] = init; + } + + // Reset per-kernel counters + kernel_sample_count = 0; + kernel_tot_power = 0; + kernel_power = init; + avg_threads_per_warp_tot = 0; + return; +} + +void gpgpu_sim_wrapper::set_inst_power(bool clk_gated_lanes, double tot_cycles, + double busy_cycles, double tot_inst, + double int_inst, double fp_inst, + double 
load_inst, double store_inst, + double committed_inst) { + p->sys.core[0].gpgpu_clock_gated_lanes = clk_gated_lanes; + p->sys.core[0].total_cycles = tot_cycles; + p->sys.core[0].busy_cycles = busy_cycles; + p->sys.core[0].total_instructions = + tot_inst * p->sys.scaling_coefficients[TOT_INST]; + p->sys.core[0].int_instructions = + int_inst * p->sys.scaling_coefficients[FP_INT]; + p->sys.core[0].fp_instructions = + fp_inst * p->sys.scaling_coefficients[FP_INT]; + p->sys.core[0].load_instructions = load_inst; + p->sys.core[0].store_instructions = store_inst; + p->sys.core[0].committed_instructions = committed_inst; + sample_perf_counters[FP_INT] = int_inst + fp_inst; + sample_perf_counters[TOT_INST] = tot_inst; +} + +void gpgpu_sim_wrapper::set_regfile_power(double reads, double writes, + double ops) { + p->sys.core[0].int_regfile_reads = + reads * p->sys.scaling_coefficients[REG_RD]; + p->sys.core[0].int_regfile_writes = + writes * p->sys.scaling_coefficients[REG_WR]; + p->sys.core[0].non_rf_operands = + ops * p->sys.scaling_coefficients[NON_REG_OPs]; + sample_perf_counters[REG_RD] = reads; + sample_perf_counters[REG_WR] = writes; + sample_perf_counters[NON_REG_OPs] = ops; +} + +void gpgpu_sim_wrapper::set_icache_power(double hits, double misses) { + p->sys.core[0].icache.read_accesses = + hits * p->sys.scaling_coefficients[IC_H] + + misses * p->sys.scaling_coefficients[IC_M]; + p->sys.core[0].icache.read_misses = + misses * p->sys.scaling_coefficients[IC_M]; + sample_perf_counters[IC_H] = hits; + sample_perf_counters[IC_M] = misses; +} + +void gpgpu_sim_wrapper::set_ccache_power(double hits, double misses) { + p->sys.core[0].ccache.read_accesses = + hits * p->sys.scaling_coefficients[CC_H] + + misses * p->sys.scaling_coefficients[CC_M]; + p->sys.core[0].ccache.read_misses = + misses * p->sys.scaling_coefficients[CC_M]; + sample_perf_counters[CC_H] = hits; + sample_perf_counters[CC_M] = misses; + // TODO: coalescing logic is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_tcache_power(double hits, double misses) { + p->sys.core[0].tcache.read_accesses = + hits * p->sys.scaling_coefficients[TC_H] + + misses * p->sys.scaling_coefficients[TC_M]; + p->sys.core[0].tcache.read_misses = + misses * p->sys.scaling_coefficients[TC_M]; + sample_perf_counters[TC_H] = hits; + sample_perf_counters[TC_M] = misses; + // TODO: coalescing logic is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_shrd_mem_power(double accesses) { + p->sys.core[0].sharedmemory.read_accesses = + accesses * p->sys.scaling_coefficients[SHRD_ACC]; + sample_perf_counters[SHRD_ACC] = accesses; +} + +void gpgpu_sim_wrapper::set_l1cache_power(double read_hits, double read_misses, + double write_hits, + double write_misses) { + p->sys.core[0].dcache.read_accesses = + read_hits * p->sys.scaling_coefficients[DC_RH] + + read_misses * p->sys.scaling_coefficients[DC_RM]; + p->sys.core[0].dcache.read_misses = + read_misses * p->sys.scaling_coefficients[DC_RM]; + p->sys.core[0].dcache.write_accesses = + write_hits * p->sys.scaling_coefficients[DC_WH] + + write_misses * p->sys.scaling_coefficients[DC_WM]; + p->sys.core[0].dcache.write_misses = + write_misses * p->sys.scaling_coefficients[DC_WM]; + sample_perf_counters[DC_RH] = read_hits; + sample_perf_counters[DC_RM] = read_misses; + sample_perf_counters[DC_WH] = write_hits; + sample_perf_counters[DC_WM] = write_misses; + // TODO: coalescing logic 
is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses, + double write_hits, + double write_misses) { + p->sys.l2.total_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + + read_misses * p->sys.scaling_coefficients[L2_RM] + + write_hits * p->sys.scaling_coefficients[L2_WH] + + write_misses * p->sys.scaling_coefficients[L2_WM]; + p->sys.l2.read_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + + read_misses * p->sys.scaling_coefficients[L2_RM]; + p->sys.l2.write_accesses = write_hits * p->sys.scaling_coefficients[L2_WH] + + write_misses * p->sys.scaling_coefficients[L2_WM]; + p->sys.l2.read_hits = read_hits * p->sys.scaling_coefficients[L2_RH]; + p->sys.l2.read_misses = read_misses * p->sys.scaling_coefficients[L2_RM]; + p->sys.l2.write_hits = write_hits * p->sys.scaling_coefficients[L2_WH]; + p->sys.l2.write_misses = write_misses * p->sys.scaling_coefficients[L2_WM]; + sample_perf_counters[L2_RH] = read_hits; + sample_perf_counters[L2_RM] = read_misses; + sample_perf_counters[L2_WH] = write_hits; + sample_perf_counters[L2_WM] = write_misses; +} + +void gpgpu_sim_wrapper::set_num_cores(double num_core) { + + num_cores = num_core; +} + +void gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) { + p->sys.num_idle_cores = num_idle_core; + sample_perf_counters[IDLE_CORE_N] = num_idle_core; + num_idle_cores = num_idle_core; +} + +void gpgpu_sim_wrapper::set_duty_cycle_power(double duty_cycle) { + p->sys.core[0].pipeline_duty_cycle = + duty_cycle * p->sys.scaling_coefficients[PIPE_A]; + sample_perf_counters[PIPE_A] = duty_cycle; +} + +void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes, + double dram_precharge) { + p->sys.mc.memory_accesses = reads * p->sys.scaling_coefficients[MEM_RD] + + writes * p->sys.scaling_coefficients[MEM_WR]; + p->sys.mc.memory_reads = reads * p->sys.scaling_coefficients[MEM_RD]; + p->sys.mc.memory_writes = writes * p->sys.scaling_coefficients[MEM_WR]; + p->sys.mc.dram_pre = dram_precharge * p->sys.scaling_coefficients[MEM_PRE]; + sample_perf_counters[MEM_RD] = reads; + sample_perf_counters[MEM_WR] = writes; + sample_perf_counters[MEM_PRE] = dram_precharge; +} + + +void gpgpu_sim_wrapper::set_model_voltage(double model_voltage) { + modeled_chip_voltage = model_voltage; +} + + +void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses, + double ialu_accesses, + double sfu_accesses) { + p->sys.core[0].fpu_accesses = fpu_accesses; + tot_fpu_accesses = fpu_accesses; + //Integer ALU (not present in Tesla) + p->sys.core[0].ialu_accesses = ialu_accesses; + + //Sfu accesses + p->sys.core[0].mul_accesses = sfu_accesses; + tot_sfu_accesses = sfu_accesses; +} + +PowerscalingCoefficients * gpgpu_sim_wrapper::get_scaling_coeffs() +{ + + PowerscalingCoefficients * scalingCoeffs = new PowerscalingCoefficients(); + + scalingCoeffs->int_coeff = p->sys.scaling_coefficients[INT_ACC]; + scalingCoeffs->int_mul_coeff = p->sys.scaling_coefficients[INT_MUL_ACC]; + scalingCoeffs->int_mul24_coeff = p->sys.scaling_coefficients[INT_MUL24_ACC]; + scalingCoeffs->int_mul32_coeff = p->sys.scaling_coefficients[INT_MUL32_ACC]; + scalingCoeffs->int_div_coeff = p->sys.scaling_coefficients[INT_DIV_ACC]; + scalingCoeffs->fp_coeff = p->sys.scaling_coefficients[FP_ACC]; + scalingCoeffs->dp_coeff = p->sys.scaling_coefficients[DP_ACC]; + scalingCoeffs->fp_mul_coeff = p->sys.scaling_coefficients[FP_MUL_ACC]; + scalingCoeffs->fp_div_coeff = 
p->sys.scaling_coefficients[FP_DIV_ACC]; + scalingCoeffs->dp_mul_coeff = p->sys.scaling_coefficients[DP_MUL_ACC]; + scalingCoeffs->dp_div_coeff = p->sys.scaling_coefficients[DP_DIV_ACC]; + scalingCoeffs->sqrt_coeff = p->sys.scaling_coefficients[FP_SQRT_ACC]; + scalingCoeffs->log_coeff = p->sys.scaling_coefficients[FP_LG_ACC]; + scalingCoeffs->sin_coeff = p->sys.scaling_coefficients[FP_SIN_ACC]; + scalingCoeffs->exp_coeff = p->sys.scaling_coefficients[FP_EXP_ACC]; + scalingCoeffs->tensor_coeff = p->sys.scaling_coefficients[TENSOR_ACC]; + scalingCoeffs->tex_coeff = p->sys.scaling_coefficients[TEX_ACC]; + return scalingCoeffs; + +} + +void gpgpu_sim_wrapper::set_int_accesses(double ialu_accesses, + double imul24_accesses, + double imul32_accesses, + double imul_accesses, + double idiv_accesses) +{ + + sample_perf_counters[INT_ACC]=ialu_accesses; + sample_perf_counters[INT_MUL24_ACC]=imul24_accesses; + sample_perf_counters[INT_MUL32_ACC]=imul32_accesses; + sample_perf_counters[INT_MUL_ACC]=imul_accesses; + sample_perf_counters[INT_DIV_ACC]=idiv_accesses; +} + +void gpgpu_sim_wrapper::set_dp_accesses(double dpu_accesses, + double dpmul_accesses, + double dpdiv_accesses) +{ + sample_perf_counters[DP_ACC]=dpu_accesses; + sample_perf_counters[DP_MUL_ACC]=dpmul_accesses; + sample_perf_counters[DP_DIV_ACC]=dpdiv_accesses; +} + +void gpgpu_sim_wrapper::set_fp_accesses(double fpu_accesses, + double fpmul_accesses, + double fpdiv_accesses) +{ + sample_perf_counters[FP_ACC]=fpu_accesses; + sample_perf_counters[FP_MUL_ACC]=fpmul_accesses; + sample_perf_counters[FP_DIV_ACC]=fpdiv_accesses; +} + +void gpgpu_sim_wrapper::set_trans_accesses(double sqrt_accesses, + double log_accesses, + double sin_accesses, + double exp_accesses) +{ + + sample_perf_counters[FP_SQRT_ACC]=sqrt_accesses; + sample_perf_counters[FP_LG_ACC]=log_accesses; + sample_perf_counters[FP_SIN_ACC]=sin_accesses; + sample_perf_counters[FP_EXP_ACC]=exp_accesses; + +} + +void gpgpu_sim_wrapper::set_tensor_accesses(double tensor_accesses) +{ + sample_perf_counters[TENSOR_ACC]=tensor_accesses; + +} + +void gpgpu_sim_wrapper::set_tex_accesses(double tex_accesses) +{ + sample_perf_counters[TEX_ACC]=tex_accesses; + +} + +void gpgpu_sim_wrapper::set_avg_active_threads(float active_threads) +{ + avg_threads_per_warp = (unsigned)ceil(active_threads); + avg_threads_per_warp_tot += active_threads; +} + +void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane, + double sfu_avg_active_lane) { + p->sys.core[0].sp_average_active_lanes = sp_avg_active_lane; + p->sys.core[0].sfu_average_active_lanes = sfu_avg_active_lane; +} + +void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_acc) { + p->sys.NoC[0].total_accesses = + noc_tot_acc * p->sys.scaling_coefficients[NOC_A]; + sample_perf_counters[NOC_A] = noc_tot_acc; +} + +void gpgpu_sim_wrapper::power_metrics_calculations() { + total_sample_count++; + kernel_sample_count++; + + // Current sample power + double sample_power = proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONSTP] + sample_cmp_pwr[STATICP]; + // double sample_power; + // for(unsigned i=0; i kernel_power.max) { + kernel_power.max = sample_power; + for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { + kernel_cmp_pwr[ind].max = (double)sample_cmp_pwr[ind]; + } + for (unsigned ind = 0; ind < num_perf_counters; ++ind) { + kernel_cmp_perf_counters[ind].max = sample_perf_counters[ind]; + } + } + + // Min Power + if (sample_power < kernel_power.min || (kernel_power.min == 0)) { + kernel_power.min = sample_power; + for (unsigned ind = 0; 
ind < num_pwr_cmps; ++ind) { + kernel_cmp_pwr[ind].min = (double)sample_cmp_pwr[ind]; + } + for (unsigned ind = 0; ind < num_perf_counters; ++ind) { + kernel_cmp_perf_counters[ind].min = sample_perf_counters[ind]; + } + } + + gpu_tot_power.avg = (gpu_tot_power.avg + sample_power); + gpu_tot_power.max = + (sample_power > gpu_tot_power.max) ? sample_power : gpu_tot_power.max; + gpu_tot_power.min = + ((sample_power < gpu_tot_power.min) || (gpu_tot_power.min == 0)) + ? sample_power + : gpu_tot_power.min; +} + +void gpgpu_sim_wrapper::print_trace_files() { + open_files(); + + for (unsigned i = 0; i < num_perf_counters; ++i) { + gzprintf(metric_trace_file, "%f,", sample_perf_counters[i]); + } + gzprintf(metric_trace_file, "\n"); + + gzprintf(power_trace_file, "%f,", proc_power); + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + gzprintf(power_trace_file, "%f,", sample_cmp_pwr[i]); + } + gzprintf(power_trace_file, "\n"); + + close_files(); +} + +void gpgpu_sim_wrapper::update_coefficients() +{ + + initpower_coeff[FP_INT]=proc->cores[0]->get_coefficient_fpint_insts(); + effpower_coeff[FP_INT]=initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT]; + + initpower_coeff[TOT_INST]=proc->cores[0]->get_coefficient_tot_insts(); + effpower_coeff[TOT_INST]=initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST]; + + initpower_coeff[REG_RD]=proc->cores[0]->get_coefficient_regreads_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + initpower_coeff[REG_WR]=proc->cores[0]->get_coefficient_regwrites_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + initpower_coeff[NON_REG_OPs]=proc->cores[0]->get_coefficient_noregfileops_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + effpower_coeff[REG_RD]=initpower_coeff[REG_RD]*p->sys.scaling_coefficients[REG_RD]; + effpower_coeff[REG_WR]=initpower_coeff[REG_WR]*p->sys.scaling_coefficients[REG_WR]; + effpower_coeff[NON_REG_OPs]=initpower_coeff[NON_REG_OPs]*p->sys.scaling_coefficients[NON_REG_OPs]; + + initpower_coeff[IC_H]=proc->cores[0]->get_coefficient_icache_hits(); + initpower_coeff[IC_M]=proc->cores[0]->get_coefficient_icache_misses(); + effpower_coeff[IC_H]=initpower_coeff[IC_H]*p->sys.scaling_coefficients[IC_H]; + effpower_coeff[IC_M]=initpower_coeff[IC_M]*p->sys.scaling_coefficients[IC_M]; + + initpower_coeff[CC_H]=(proc->cores[0]->get_coefficient_ccache_readhits()+proc->get_coefficient_readcoalescing()); + initpower_coeff[CC_M]=(proc->cores[0]->get_coefficient_ccache_readmisses()+proc->get_coefficient_readcoalescing()); + effpower_coeff[CC_H]=initpower_coeff[CC_H]*p->sys.scaling_coefficients[CC_H]; + effpower_coeff[CC_M]=initpower_coeff[CC_M]*p->sys.scaling_coefficients[CC_M]; + + initpower_coeff[TC_H]=(proc->cores[0]->get_coefficient_tcache_readhits()+proc->get_coefficient_readcoalescing()); + initpower_coeff[TC_M]=(proc->cores[0]->get_coefficient_tcache_readmisses()+proc->get_coefficient_readcoalescing()); + effpower_coeff[TC_H]=initpower_coeff[TC_H]*p->sys.scaling_coefficients[TC_H]; + effpower_coeff[TC_M]=initpower_coeff[TC_M]*p->sys.scaling_coefficients[TC_M]; + + initpower_coeff[SHRD_ACC]=proc->cores[0]->get_coefficient_sharedmemory_readhits(); + effpower_coeff[SHRD_ACC]=initpower_coeff[SHRD_ACC]*p->sys.scaling_coefficients[SHRD_ACC]; + + initpower_coeff[DC_RH]=(proc->cores[0]->get_coefficient_dcache_readhits() + proc->get_coefficient_readcoalescing()); + initpower_coeff[DC_RM]=(proc->cores[0]->get_coefficient_dcache_readmisses() + 
proc->get_coefficient_readcoalescing()); + initpower_coeff[DC_WH]=(proc->cores[0]->get_coefficient_dcache_writehits() + proc->get_coefficient_writecoalescing()); + initpower_coeff[DC_WM]=(proc->cores[0]->get_coefficient_dcache_writemisses() + proc->get_coefficient_writecoalescing()); + effpower_coeff[DC_RH]=initpower_coeff[DC_RH]*p->sys.scaling_coefficients[DC_RH]; + effpower_coeff[DC_RM]=initpower_coeff[DC_RM]*p->sys.scaling_coefficients[DC_RM]; + effpower_coeff[DC_WH]=initpower_coeff[DC_WH]*p->sys.scaling_coefficients[DC_WH]; + effpower_coeff[DC_WM]=initpower_coeff[DC_WM]*p->sys.scaling_coefficients[DC_WM]; + + initpower_coeff[L2_RH]=proc->get_coefficient_l2_read_hits(); + initpower_coeff[L2_RM]=proc->get_coefficient_l2_read_misses(); + initpower_coeff[L2_WH]=proc->get_coefficient_l2_write_hits(); + initpower_coeff[L2_WM]=proc->get_coefficient_l2_write_misses(); + effpower_coeff[L2_RH]=initpower_coeff[L2_RH]*p->sys.scaling_coefficients[L2_RH]; + effpower_coeff[L2_RM]=initpower_coeff[L2_RM]*p->sys.scaling_coefficients[L2_RM]; + effpower_coeff[L2_WH]=initpower_coeff[L2_WH]*p->sys.scaling_coefficients[L2_WH]; + effpower_coeff[L2_WM]=initpower_coeff[L2_WM]*p->sys.scaling_coefficients[L2_WM]; + + initpower_coeff[IDLE_CORE_N]=p->sys.idle_core_power * proc->cores[0]->executionTime; + effpower_coeff[IDLE_CORE_N]=initpower_coeff[IDLE_CORE_N]*p->sys.scaling_coefficients[IDLE_CORE_N]; + + initpower_coeff[PIPE_A]=proc->cores[0]->get_coefficient_duty_cycle(); + effpower_coeff[PIPE_A]=initpower_coeff[PIPE_A]*p->sys.scaling_coefficients[PIPE_A]; + + initpower_coeff[MEM_RD]=proc->get_coefficient_mem_reads(); + initpower_coeff[MEM_WR]=proc->get_coefficient_mem_writes(); + initpower_coeff[MEM_PRE]=proc->get_coefficient_mem_pre(); + effpower_coeff[MEM_RD]=initpower_coeff[MEM_RD]*p->sys.scaling_coefficients[MEM_RD]; + effpower_coeff[MEM_WR]=initpower_coeff[MEM_WR]*p->sys.scaling_coefficients[MEM_WR]; + effpower_coeff[MEM_PRE]=initpower_coeff[MEM_PRE]*p->sys.scaling_coefficients[MEM_PRE]; + + double fp_coeff = proc->cores[0]->get_coefficient_fpu_accesses(); + double sfu_coeff = proc->cores[0]->get_coefficient_sfu_accesses(); + + initpower_coeff[INT_ACC]= proc->cores[0]->get_coefficient_ialu_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + if(tot_fpu_accesses != 0){ + initpower_coeff[FP_ACC]= fp_coeff * sample_perf_counters[FP_ACC]/tot_fpu_accesses; + initpower_coeff[DP_ACC]= fp_coeff * sample_perf_counters[DP_ACC]/tot_fpu_accesses; + } + else{ + initpower_coeff[FP_ACC]= 0; + initpower_coeff[DP_ACC]= 0; + } + + if(tot_sfu_accesses != 0){ + initpower_coeff[INT_MUL24_ACC]= sfu_coeff * sample_perf_counters[INT_MUL24_ACC]/tot_sfu_accesses; + initpower_coeff[INT_MUL32_ACC]= sfu_coeff * sample_perf_counters[INT_MUL32_ACC]/tot_sfu_accesses; + initpower_coeff[INT_MUL_ACC]= sfu_coeff * sample_perf_counters[INT_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[INT_DIV_ACC]= sfu_coeff * sample_perf_counters[INT_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[DP_MUL_ACC]= sfu_coeff * sample_perf_counters[DP_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[DP_DIV_ACC]= sfu_coeff * sample_perf_counters[DP_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[FP_MUL_ACC]= sfu_coeff * sample_perf_counters[FP_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[FP_DIV_ACC]= sfu_coeff * sample_perf_counters[FP_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[FP_SQRT_ACC]= sfu_coeff * sample_perf_counters[FP_SQRT_ACC]/tot_sfu_accesses; + initpower_coeff[FP_LG_ACC]= sfu_coeff * 
sample_perf_counters[FP_LG_ACC]/tot_sfu_accesses; + initpower_coeff[FP_SIN_ACC]= sfu_coeff * sample_perf_counters[FP_SIN_ACC]/tot_sfu_accesses; + initpower_coeff[FP_EXP_ACC]= sfu_coeff * sample_perf_counters[FP_EXP_ACC]/tot_sfu_accesses; + initpower_coeff[TENSOR_ACC]= sfu_coeff * sample_perf_counters[TENSOR_ACC]/tot_sfu_accesses; + initpower_coeff[TEX_ACC]= sfu_coeff * sample_perf_counters[TEX_ACC]/tot_sfu_accesses; + } + else{ + initpower_coeff[INT_MUL24_ACC]= 0; + initpower_coeff[INT_MUL32_ACC]= 0; + initpower_coeff[INT_MUL_ACC]= 0; + initpower_coeff[INT_DIV_ACC]= 0; + initpower_coeff[DP_MUL_ACC]= 0; + initpower_coeff[DP_DIV_ACC]= 0; + initpower_coeff[FP_MUL_ACC]= 0; + initpower_coeff[FP_DIV_ACC]= 0; + initpower_coeff[FP_SQRT_ACC]= 0; + initpower_coeff[FP_LG_ACC]= 0; + initpower_coeff[FP_SIN_ACC]= 0; + initpower_coeff[FP_EXP_ACC]= 0; + initpower_coeff[TENSOR_ACC]= 0; + initpower_coeff[TEX_ACC]= 0; + } + + effpower_coeff[INT_ACC]= initpower_coeff[INT_ACC]; + effpower_coeff[FP_ACC]= initpower_coeff[FP_ACC]; + effpower_coeff[DP_ACC]= initpower_coeff[DP_ACC]; + effpower_coeff[INT_MUL24_ACC]= initpower_coeff[INT_MUL24_ACC]; + effpower_coeff[INT_MUL32_ACC]= initpower_coeff[INT_MUL32_ACC]; + effpower_coeff[INT_MUL_ACC]= initpower_coeff[INT_MUL_ACC]; + effpower_coeff[INT_DIV_ACC]= initpower_coeff[INT_DIV_ACC]; + effpower_coeff[DP_MUL_ACC]= initpower_coeff[DP_MUL_ACC]; + effpower_coeff[DP_DIV_ACC]= initpower_coeff[DP_DIV_ACC]; + effpower_coeff[FP_MUL_ACC]= initpower_coeff[FP_MUL_ACC]; + effpower_coeff[FP_DIV_ACC]= initpower_coeff[FP_DIV_ACC]; + effpower_coeff[FP_SQRT_ACC]= initpower_coeff[FP_SQRT_ACC]; + effpower_coeff[FP_LG_ACC]= initpower_coeff[FP_LG_ACC]; + effpower_coeff[FP_SIN_ACC]= initpower_coeff[FP_SIN_ACC]; + effpower_coeff[FP_EXP_ACC]= initpower_coeff[FP_EXP_ACC]; + effpower_coeff[TENSOR_ACC]= initpower_coeff[TENSOR_ACC]; + effpower_coeff[TEX_ACC]= initpower_coeff[TEX_ACC]; + + initpower_coeff[NOC_A]=proc->get_coefficient_noc_accesses(); + effpower_coeff[NOC_A]=initpower_coeff[NOC_A]*p->sys.scaling_coefficients[NOC_A]; + + //const_dynamic_power=proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + + for(unsigned i=0; icores[0]->executionTime); + effpower_coeff[i]/=(proc->cores[0]->executionTime); + } +} + +double gpgpu_sim_wrapper::calculate_static_power(){ + double int_accesses = initpower_coeff[INT_ACC] + initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; + double int_add_accesses = initpower_coeff[INT_ACC]; + double int_mul_accesses = initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; + double fp_accesses = initpower_coeff[FP_ACC] + initpower_coeff[FP_MUL_ACC] + initpower_coeff[FP_DIV_ACC]; + double dp_accesses = initpower_coeff[DP_ACC] + initpower_coeff[DP_MUL_ACC] + initpower_coeff[DP_DIV_ACC]; + double sfu_accesses = initpower_coeff[FP_SQRT_ACC] + initpower_coeff[FP_LG_ACC] + initpower_coeff[FP_SIN_ACC] + initpower_coeff[FP_EXP_ACC]; + double tensor_accesses = initpower_coeff[TENSOR_ACC]; + double tex_accesses = initpower_coeff[TEX_ACC]; + double total_static_power = 0.0; + double base_static_power = 0.0; + double lane_static_power = 0.0; + double per_active_core = (num_cores - num_idle_cores)/num_cores; + + + double l1_accesses = initpower_coeff[DC_RH] + initpower_coeff[DC_RM] + initpower_coeff[DC_WH] + initpower_coeff[DC_WM]; + double l2_accesses = initpower_coeff[L2_RH] + initpower_coeff[L2_RM] + 
initpower_coeff[L2_WH] + initpower_coeff[L2_WM]; + double shared_accesses = initpower_coeff[SHRD_ACC]; + + + if(avg_threads_per_warp == 0){ //no functional unit threads, check for memory or a 'LIGHT_SM' + if(l1_accesses != 0.0) + return (p->sys.static_l1_flane*per_active_core); + else if(shared_accesses != 0.0) + return (p->sys.static_shared_flane*per_active_core); + else if(l2_accesses != 0.0) + return (p->sys.static_l2_flane*per_active_core); + else //LIGHT_SM + return (p->sys.static_light_flane*per_active_core); //return LIGHT_SM base static power + } + + /* using a linear model for thread divergence */ + if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses != 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_DP */ + base_static_power = p->sys.static_cat3_flane; + lane_static_power = p->sys.static_cat3_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses != 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_TENSOR */ + base_static_power = p->sys.static_cat6_flane; + lane_static_power = p->sys.static_cat6_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses != 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_SFU */ + base_static_power = p->sys.static_cat4_flane; + lane_static_power = p->sys.static_cat4_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses != 0.0)){ + /* INT_FP_TEX */ + base_static_power = p->sys.static_cat5_flane; + lane_static_power = p->sys.static_cat5_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP */ + base_static_power = p->sys.static_cat2_flane; + lane_static_power = p->sys.static_cat2_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses == 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT */ + /* Seperating INT_ADD only and INT_MUL only from mix of INT instructions */ + if((int_add_accesses != 0.0) && (int_mul_accesses == 0.0)){ //INT_ADD + base_static_power = p->sys.static_intadd_flane; + lane_static_power = p->sys.static_intadd_addlane; + } + else if((int_add_accesses == 0.0) && (int_mul_accesses != 0.0)){ //INT_MUL + base_static_power = p->sys.static_intmul_flane; + lane_static_power = p->sys.static_intmul_addlane; + } + else{ //INT_ADD+MUL + base_static_power = p->sys.static_cat1_flane; + lane_static_power = p->sys.static_cat1_addlane; + } + } + + else if((int_accesses == 0.0) && (fp_accesses == 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* LIGHT_SM or memory only sample */ + lane_static_power = 0.0; //addlane static power is 0 for l1/l2/shared memory only accesses + if(l1_accesses != 0.0) + base_static_power = p->sys.static_l1_flane; + else if(shared_accesses != 0.0) + base_static_power = p->sys.static_shared_flane; + else if(l2_accesses != 0.0) + base_static_power = p->sys.static_l2_flane; + else{ + base_static_power = p->sys.static_light_flane; + lane_static_power = p->sys.static_light_addlane; + } + } + else{ + base_static_power = p->sys.static_geomean_flane; //GEOMEAN except LIGHT_SM if we don't fall into any of the categories above + 
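// Illustration (not part of the patch): whichever category is selected above only picks the
// pair (base_static_power, lane_static_power); the linear thread-divergence model applied at
// the end of this function is
//   total_static_power = (base_static_power + (avg_threads_per_warp - 1) * lane_static_power) * per_active_core
// with per_active_core = (num_cores - num_idle_cores) / num_cores. As a worked example with
// purely hypothetical coefficients (base = 10 W for the first active lane, 0.5 W per
// additional lane), a sample averaging 20 active threads per warp on a GPU with 80 SMs,
// 16 of them idle, would contribute (10 + 19 * 0.5) * (80 - 16) / 80 = 19.5 * 0.8 = 15.6 W
// of static power. The real *_flane / *_addlane coefficients are read from the AccelWattch
// XML configuration via ParseXML.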
lane_static_power = p->sys.static_geomean_addlane; + } + + total_static_power = base_static_power + (((double)avg_threads_per_warp-1.0)*lane_static_power); //Linear Model + return (total_static_power*per_active_core); +} + +void gpgpu_sim_wrapper::update_components_power() +{ + + update_coefficients(); + + proc_power=proc->rt_power.readOp.dynamic; + sample_cmp_pwr[IBP]=(proc->cores[0]->ifu->IB->rt_power.readOp.dynamic + +proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic + +proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic + +proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic + +proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic)/(proc->cores[0]->executionTime); + + sample_cmp_pwr[ICP]=proc->cores[0]->ifu->icache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[DCP]=proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[TCP]=proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[CCP]=proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[SHRDP]=proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[RFP]=(proc->cores[0]->exu->rfu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)) + *(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + double sample_fp_pwr = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)); + + double sample_sfu_pwr = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)); + + sample_cmp_pwr[INTP]=(proc->cores[0]->exu->exeu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)) + *(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + + if(tot_fpu_accesses != 0){ + sample_cmp_pwr[FPUP]= sample_fp_pwr * sample_perf_counters[FP_ACC]/tot_fpu_accesses; + sample_cmp_pwr[DPUP]= sample_fp_pwr * sample_perf_counters[DP_ACC]/tot_fpu_accesses; + } + else{ + sample_cmp_pwr[FPUP]= 0; + sample_cmp_pwr[DPUP]= 0; + } + if(tot_sfu_accesses != 0){ + sample_cmp_pwr[INT_MUL24P]= sample_sfu_pwr * sample_perf_counters[INT_MUL24_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_MUL32P]= sample_sfu_pwr * sample_perf_counters[INT_MUL32_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_MULP]= sample_sfu_pwr * sample_perf_counters[INT_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_DIVP]= sample_sfu_pwr * sample_perf_counters[INT_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_MULP]= sample_sfu_pwr * sample_perf_counters[FP_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_DIVP]= sample_sfu_pwr * sample_perf_counters[FP_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_SQRTP]= sample_sfu_pwr * sample_perf_counters[FP_SQRT_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_LGP]= sample_sfu_pwr * sample_perf_counters[FP_LG_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_SINP]= sample_sfu_pwr * sample_perf_counters[FP_SIN_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_EXP]= sample_sfu_pwr * sample_perf_counters[FP_EXP_ACC]/tot_sfu_accesses; + sample_cmp_pwr[DP_MULP]= sample_sfu_pwr * sample_perf_counters[DP_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[DP_DIVP]= sample_sfu_pwr * sample_perf_counters[DP_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[TENSORP]= sample_sfu_pwr * sample_perf_counters[TENSOR_ACC]/tot_sfu_accesses; + sample_cmp_pwr[TEXP]= sample_sfu_pwr * sample_perf_counters[TEX_ACC]/tot_sfu_accesses; + } + else{ + sample_cmp_pwr[INT_MUL24P]= 0; + sample_cmp_pwr[INT_MUL32P]= 0; + 
sample_cmp_pwr[INT_MULP]= 0; + sample_cmp_pwr[INT_DIVP]= 0; + sample_cmp_pwr[FP_MULP]= 0; + sample_cmp_pwr[FP_DIVP]= 0; + sample_cmp_pwr[FP_SQRTP]= 0; + sample_cmp_pwr[FP_LGP]= 0; + sample_cmp_pwr[FP_SINP]= 0; + sample_cmp_pwr[FP_EXP]= 0; + sample_cmp_pwr[DP_MULP]= 0; + sample_cmp_pwr[DP_DIVP]= 0; + sample_cmp_pwr[TENSORP]= 0; + sample_cmp_pwr[TEXP]= 0; + } + + sample_cmp_pwr[SCHEDP]=proc->cores[0]->exu->scheu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[L2CP]=(proc->XML->sys.number_of_L2s>0)? proc->l2array[0]->rt_power.readOp.dynamic/(proc->cores[0]->executionTime):0; + + sample_cmp_pwr[MCP]=(proc->mc->rt_power.readOp.dynamic-proc->mc->dram->rt_power.readOp.dynamic)/(proc->cores[0]->executionTime); + + sample_cmp_pwr[NOCP]=proc->nocs[0]->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[DRAMP]=proc->mc->dram->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[PIPEP]=proc->cores[0]->Pipeline_energy/(proc->cores[0]->executionTime); + + sample_cmp_pwr[IDLE_COREP]=proc->cores[0]->IdleCoreEnergy/(proc->cores[0]->executionTime); + + // This constant dynamic power (e.g., clock power) part is estimated via regression model. + sample_cmp_pwr[CONSTP]=0; + sample_cmp_pwr[STATICP]=0; + // double cnst_dyn = proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + // // If the regression scaling term is greater than the recorded constant dynamic power + // // then use the difference (other portion already added to dynamic power). Else, + // // all the constant dynamic power is accounted for, add nothing. + // if(p->sys.scaling_coefficients[constant_power] > cnst_dyn) + // sample_cmp_pwr[CONSTP] = (p->sys.scaling_coefficients[constant_power]-cnst_dyn); + sample_cmp_pwr[CONSTP] = p->sys.scaling_coefficients[constant_power]; + sample_cmp_pwr[STATICP] = calculate_static_power(); + + if(g_dvfs_enabled){ + double voltage_ratio = modeled_chip_voltage/p->sys.modeled_chip_voltage_ref; + sample_cmp_pwr[IDLE_COREP] *= voltage_ratio; // static power scaled by voltage_ratio + sample_cmp_pwr[STATICP] *= voltage_ratio; // static power scaled by voltage_ratio + for(unsigned i=0; icompute(); } +void gpgpu_sim_wrapper::print_power_kernel_stats( + double gpu_sim_cycle, double gpu_tot_sim_cycle, double init_value, + const std::string& kernel_info_string, bool print_trace) { + detect_print_steady_state(1, init_value); + if (g_power_simulation_enabled) { + powerfile << kernel_info_string << std::endl; + + sanity_check((kernel_power.avg * kernel_sample_count), kernel_tot_power); + powerfile << "Kernel Average Power Data:" << std::endl; + powerfile << "kernel_avg_power = " << kernel_power.avg << std::endl; + + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + powerfile << "gpu_avg_" << pwr_cmp_label[i] << " = " + << kernel_cmp_pwr[i].avg / kernel_sample_count << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_avg_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].avg / kernel_sample_count + << std::endl; + } + + powerfile << "gpu_avg_threads_per_warp = " + << avg_threads_per_warp_tot / (double)kernel_sample_count + << std::endl; + + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_tot_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].avg + << std::endl; + } + + powerfile << std::endl << "Kernel Maximum Power Data:" << std::endl; + powerfile << "kernel_max_power = " << kernel_power.max << std::endl; + for (unsigned i = 0; i < num_pwr_cmps; ++i) 
{ + powerfile << "gpu_max_" << pwr_cmp_label[i] << " = " + << kernel_cmp_pwr[i].max << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_max_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].max << std::endl; + } + + powerfile << std::endl << "Kernel Minimum Power Data:" << std::endl; + powerfile << "kernel_min_power = " << kernel_power.min << std::endl; + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + powerfile << "gpu_min_" << pwr_cmp_label[i] << " = " + << kernel_cmp_pwr[i].min << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_min_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].min << std::endl; + } + + powerfile << std::endl + << "Accumulative Power Statistics Over Previous Kernels:" + << std::endl; + powerfile << "gpu_tot_avg_power = " + << gpu_tot_power.avg / total_sample_count << std::endl; + powerfile << "gpu_tot_max_power = " << gpu_tot_power.max << std::endl; + powerfile << "gpu_tot_min_power = " << gpu_tot_power.min << std::endl; + powerfile << std::endl << std::endl; + powerfile.flush(); + + if (print_trace) { + print_trace_files(); + } + } +} +void gpgpu_sim_wrapper::dump() { + if (g_power_per_cycle_dump) proc->displayEnergy(2, 5); +} + +void gpgpu_sim_wrapper::print_steady_state(int position, double init_val) { + double temp_avg = sample_val / (double)samples.size(); + double temp_ipc = (init_val - init_inst_val) / + (double)(samples.size() * gpu_stat_sample_freq); + + if ((samples.size() > + gpu_steady_min_period)) { // If steady state occurred for some time, + // print to file + has_written_avg = true; + gzprintf(steady_state_tacking_file, "%u,%d,%f,%f,", sample_start, + total_sample_count, temp_avg, temp_ipc); + for (unsigned i = 0; i < num_perf_counters; ++i) { + gzprintf(steady_state_tacking_file, "%f,", + samples_counter.at(i) / ((double)samples.size())); + } + gzprintf(steady_state_tacking_file, "\n"); + } else { + if (!has_written_avg && position) + gzprintf(steady_state_tacking_file, + "ERROR! 
Not enough steady state points to generate average\n"); + } + + sample_start = 0; + sample_val = 0; + init_inst_val = init_val; + samples.clear(); + samples_counter.clear(); + pwr_counter.clear(); + assert(samples.size() == 0); +} + +void gpgpu_sim_wrapper::detect_print_steady_state(int position, + double init_val) { + // Calculating Average + if (g_power_simulation_enabled && g_steady_power_levels_enabled) { + steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "a"); + if (position == 0) { + if (samples.size() == 0) { + // First sample + sample_start = total_sample_count; + sample_val = proc->rt_power.readOp.dynamic; + init_inst_val = init_val; + samples.push_back(proc->rt_power.readOp.dynamic); + assert(samples_counter.size() == 0); + assert(pwr_counter.size() == 0); + + for (unsigned i = 0; i < (num_perf_counters); ++i) { + samples_counter.push_back(sample_perf_counters[i]); + } + + for (unsigned i = 0; i < (num_pwr_cmps); ++i) { + pwr_counter.push_back(sample_cmp_pwr[i]); + } + assert(pwr_counter.size() == (double)num_pwr_cmps); + assert(samples_counter.size() == (double)num_perf_counters); + } else { + // Get current average + double temp_avg = sample_val / (double)samples.size(); + + if (abs(proc->rt_power.readOp.dynamic - temp_avg) < + gpu_steady_power_deviation) { // Value is within threshold + sample_val += proc->rt_power.readOp.dynamic; + samples.push_back(proc->rt_power.readOp.dynamic); + for (unsigned i = 0; i < (num_perf_counters); ++i) { + samples_counter.at(i) += sample_perf_counters[i]; + } + + for (unsigned i = 0; i < (num_pwr_cmps); ++i) { + pwr_counter.at(i) += sample_cmp_pwr[i]; + } + + } else { // Value exceeds threshold, not considered steady state + print_steady_state(position, init_val); + } + } + } else { + print_steady_state(position, init_val); + } + gzclose(steady_state_tacking_file); + } +} + +void gpgpu_sim_wrapper::open_files() { + if (g_power_simulation_enabled) { + if (g_power_trace_enabled) { + power_trace_file = gzopen(g_power_trace_filename, "a"); + metric_trace_file = gzopen(g_metric_trace_filename, "a"); + } + } +} +void gpgpu_sim_wrapper::close_files() { + if (g_power_simulation_enabled) { + if (g_power_trace_enabled) { + gzclose(power_trace_file); + gzclose(metric_trace_file); + } + } +} diff --git a/src/gpuwattch/gpgpu_sim_wrapper.h b/src/accelwattch/gpgpu_sim_wrapper.h similarity index 68% rename from src/gpuwattch/gpgpu_sim_wrapper.h rename to src/accelwattch/gpgpu_sim_wrapper.h index 00e4f0746..33c4b72f2 100644 --- a/src/gpuwattch/gpgpu_sim_wrapper.h +++ b/src/accelwattch/gpgpu_sim_wrapper.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,9 +55,34 @@ struct avg_max_min_counters { } }; +#ifndef COEFF_STRUCT +#define COEFF_STRUCT + +struct PowerscalingCoefficients{ + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; +}; + +#endif + class gpgpu_sim_wrapper { public: - gpgpu_sim_wrapper(bool power_simulation_enabled, char* xmlfile); + gpgpu_sim_wrapper(bool power_simulation_enabled, char* xmlfile, int power_simulation_mode, bool dvfs_enabled); ~gpgpu_sim_wrapper(); void init_mcpat(char* xmlfile, char* powerfile, char* power_trace_file, @@ -64,7 +90,9 @@ class gpgpu_sim_wrapper { bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, bool power_per_cycle_dump, double steady_power_deviation, double steady_min_period, - int zlevel, double init_val, int stat_sample_freq); + int zlevel, double init_val, int stat_sample_freq, int power_sim_mode, + bool dvfs_enabled, unsigned clock_freq, unsigned num_shaders); + void init_mcpat_hw_mode(unsigned gpu_sim_cycle); void detect_print_steady_state(int position, double init_val); void close_files(); void open_files(); @@ -72,6 +100,7 @@ class gpgpu_sim_wrapper { void dump(); void print_trace_files(); void update_components_power(); + double calculate_static_power(); void update_coefficients(); void reset_counters(); void print_power_kernel_stats(double gpu_sim_cycle, double gpu_tot_sim_cycle, @@ -79,6 +108,7 @@ class gpgpu_sim_wrapper { const std::string& kernel_info_string, bool print_trace); void power_metrics_calculations(); + void set_model_voltage(double model_voltage); void set_inst_power(bool clk_gated_lanes, double tot_cycles, double busy_cycles, double tot_inst, double int_inst, double fp_inst, double load_inst, double store_inst, @@ -92,16 +122,31 @@ class gpgpu_sim_wrapper { double write_accesses, double write_misses); void set_l2cache_power(double read_accesses, double read_misses, double write_accesses, double write_misses); + void set_num_cores(double num_core); void set_idle_core_power(double num_idle_core); void set_duty_cycle_power(double duty_cycle); void set_mem_ctrl_power(double reads, double writes, double dram_precharge); void set_exec_unit_power(double fpu_accesses, double ialu_accesses, double sfu_accesses); + void set_int_accesses(double ialu_accesses, double 
imul24_accesses, + double imul32_accesses, double imul_accesses, + double idiv_accesses); + void set_dp_accesses(double dpu_accesses, double dpmul_accesses, + double dpdiv_accesses); + void set_fp_accesses(double fpu_accesses, double fpmul_accesses, + double fpdiv_accesses); + void set_trans_accesses(double sqrt_accesses, double log_accesses, + double sin_accesses, double exp_accesses); + void set_tensor_accesses(double tensor_accesses); + void set_tex_accesses(double tex_accesses); + void set_avg_active_threads(float active_threads); void set_active_lanes_power(double sp_avg_active_lane, double sfu_avg_active_lane); - void set_NoC_power(double noc_tot_reads, double noc_tot_write); + void set_NoC_power(double noc_tot_acc); bool sanity_check(double a, double b); + PowerscalingCoefficients * get_scaling_coeffs(); + private: void print_steady_state(int position, double init_val); @@ -109,8 +154,10 @@ class gpgpu_sim_wrapper { ParseXML* p; // power parameters double const_dynamic_power; + double avg_threads_per_warp_tot; double proc_power; - + double num_cores; + double num_idle_cores; unsigned num_perf_counters; // # of performance counters unsigned num_pwr_cmps; // # of components modelled int kernel_sample_count; // # of samples per kernel @@ -140,6 +187,10 @@ class gpgpu_sim_wrapper { unsigned sample_start; double sample_val; double init_inst_val; + double tot_sfu_accesses; + double tot_fpu_accesses; + double modeled_chip_voltage; + unsigned avg_threads_per_warp; std::vector samples; std::vector samples_counter; std::vector pwr_counter; @@ -150,6 +201,8 @@ class gpgpu_sim_wrapper { char* g_metric_trace_filename; char* g_steady_state_tracking_filename; bool g_power_simulation_enabled; + int g_power_simulation_mode; + bool g_dvfs_enabled; bool g_steady_power_levels_enabled; bool g_power_trace_enabled; bool g_power_per_cycle_dump; diff --git a/src/gpuwattch/gpgpu_static.xml b/src/accelwattch/gpgpu_static.xml similarity index 100% rename from src/gpuwattch/gpgpu_static.xml rename to src/accelwattch/gpgpu_static.xml diff --git a/src/gpuwattch/interconnect.cc b/src/accelwattch/interconnect.cc similarity index 100% rename from src/gpuwattch/interconnect.cc rename to src/accelwattch/interconnect.cc diff --git a/src/gpuwattch/interconnect.h b/src/accelwattch/interconnect.h similarity index 100% rename from src/gpuwattch/interconnect.h rename to src/accelwattch/interconnect.h diff --git a/src/gpuwattch/iocontrollers.cc b/src/accelwattch/iocontrollers.cc similarity index 100% rename from src/gpuwattch/iocontrollers.cc rename to src/accelwattch/iocontrollers.cc diff --git a/src/gpuwattch/iocontrollers.h b/src/accelwattch/iocontrollers.h similarity index 100% rename from src/gpuwattch/iocontrollers.h rename to src/accelwattch/iocontrollers.h diff --git a/src/gpuwattch/logic.cc b/src/accelwattch/logic.cc similarity index 100% rename from src/gpuwattch/logic.cc rename to src/accelwattch/logic.cc diff --git a/src/gpuwattch/logic.h b/src/accelwattch/logic.h similarity index 100% rename from src/gpuwattch/logic.h rename to src/accelwattch/logic.h diff --git a/src/gpuwattch/main.cc b/src/accelwattch/main.cc similarity index 100% rename from src/gpuwattch/main.cc rename to src/accelwattch/main.cc diff --git a/src/gpuwattch/makefile b/src/accelwattch/makefile similarity index 100% rename from src/gpuwattch/makefile rename to src/accelwattch/makefile diff --git a/src/gpuwattch/mcpat.mk b/src/accelwattch/mcpat.mk similarity index 97% rename from src/gpuwattch/mcpat.mk rename to src/accelwattch/mcpat.mk index 
a09c23b4c..ad2d6c299 100644 --- a/src/gpuwattch/mcpat.mk +++ b/src/accelwattch/mcpat.mk @@ -1,5 +1,5 @@ -OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch +OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/accelwattch TARGET = mcpat SHELL = /bin/sh .PHONY: all depend clean diff --git a/src/gpuwattch/mcpatXeonCore.mk b/src/accelwattch/mcpatXeonCore.mk similarity index 100% rename from src/gpuwattch/mcpatXeonCore.mk rename to src/accelwattch/mcpatXeonCore.mk diff --git a/src/gpuwattch/memoryctrl.cc b/src/accelwattch/memoryctrl.cc similarity index 100% rename from src/gpuwattch/memoryctrl.cc rename to src/accelwattch/memoryctrl.cc diff --git a/src/gpuwattch/memoryctrl.h b/src/accelwattch/memoryctrl.h similarity index 100% rename from src/gpuwattch/memoryctrl.h rename to src/accelwattch/memoryctrl.h diff --git a/src/gpuwattch/noc.cc b/src/accelwattch/noc.cc similarity index 100% rename from src/gpuwattch/noc.cc rename to src/accelwattch/noc.cc diff --git a/src/gpuwattch/noc.h b/src/accelwattch/noc.h similarity index 100% rename from src/gpuwattch/noc.h rename to src/accelwattch/noc.h diff --git a/src/gpuwattch/processor.cc b/src/accelwattch/processor.cc similarity index 99% rename from src/gpuwattch/processor.cc rename to src/accelwattch/processor.cc index fc6db463d..9e7f5b2c5 100644 --- a/src/gpuwattch/processor.cc +++ b/src/accelwattch/processor.cc @@ -30,11 +30,13 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ + #include "processor.h" #include #include @@ -118,7 +120,7 @@ Processor::Processor(ParseXML *XML_interface) set_pppm(pppm_t, cores[i]->clockRate * procdynp.numCore, procdynp.numCore, procdynp.numCore, procdynp.numCore); // set the exClockRate - exClockRate = cores[0]->clockRate * 2; // TODO; get from XML file + exClockRate = cores[0]->clockRate; // TODO; get from XML file // cout<<"****EX clock rate:"<power * pppm_t; set_pppm(pppm_t, 1 / cores[i]->executionTime, procdynp.numCore, diff --git a/src/gpuwattch/processor.h b/src/accelwattch/processor.h similarity index 100% rename from src/gpuwattch/processor.h rename to src/accelwattch/processor.h diff --git a/src/gpuwattch/quadro.xml b/src/accelwattch/quadro.xml similarity index 100% rename from src/gpuwattch/quadro.xml rename to src/accelwattch/quadro.xml diff --git a/src/gpuwattch/results/Alpha21364 b/src/accelwattch/results/Alpha21364 similarity index 100% rename from src/gpuwattch/results/Alpha21364 rename to src/accelwattch/results/Alpha21364 diff --git a/src/gpuwattch/results/Alpha21364_90nm b/src/accelwattch/results/Alpha21364_90nm similarity index 100% rename from src/gpuwattch/results/Alpha21364_90nm rename to src/accelwattch/results/Alpha21364_90nm diff --git a/src/gpuwattch/results/Penryn b/src/accelwattch/results/Penryn similarity index 100% rename from src/gpuwattch/results/Penryn rename to src/accelwattch/results/Penryn diff --git a/src/gpuwattch/results/T1 b/src/accelwattch/results/T1 similarity index 100% rename 
from src/gpuwattch/results/T1 rename to src/accelwattch/results/T1 diff --git a/src/gpuwattch/results/T1_DC_64 b/src/accelwattch/results/T1_DC_64 similarity index 100% rename from src/gpuwattch/results/T1_DC_64 rename to src/accelwattch/results/T1_DC_64 diff --git a/src/gpuwattch/results/T1_SBT_64 b/src/accelwattch/results/T1_SBT_64 similarity index 100% rename from src/gpuwattch/results/T1_SBT_64 rename to src/accelwattch/results/T1_SBT_64 diff --git a/src/gpuwattch/results/T1_ST_64 b/src/accelwattch/results/T1_ST_64 similarity index 100% rename from src/gpuwattch/results/T1_ST_64 rename to src/accelwattch/results/T1_ST_64 diff --git a/src/gpuwattch/results/T2 b/src/accelwattch/results/T2 similarity index 100% rename from src/gpuwattch/results/T2 rename to src/accelwattch/results/T2 diff --git a/src/gpuwattch/results/Xeon_core b/src/accelwattch/results/Xeon_core similarity index 100% rename from src/gpuwattch/results/Xeon_core rename to src/accelwattch/results/Xeon_core diff --git a/src/gpuwattch/results/Xeon_uncore b/src/accelwattch/results/Xeon_uncore similarity index 100% rename from src/gpuwattch/results/Xeon_uncore rename to src/accelwattch/results/Xeon_uncore diff --git a/src/gpuwattch/sharedcache.cc b/src/accelwattch/sharedcache.cc similarity index 100% rename from src/gpuwattch/sharedcache.cc rename to src/accelwattch/sharedcache.cc diff --git a/src/gpuwattch/sharedcache.h b/src/accelwattch/sharedcache.h similarity index 100% rename from src/gpuwattch/sharedcache.h rename to src/accelwattch/sharedcache.h diff --git a/src/gpuwattch/technology_xeon_core.cc b/src/accelwattch/technology_xeon_core.cc similarity index 100% rename from src/gpuwattch/technology_xeon_core.cc rename to src/accelwattch/technology_xeon_core.cc diff --git a/src/gpuwattch/version.h b/src/accelwattch/version.h similarity index 100% rename from src/gpuwattch/version.h rename to src/accelwattch/version.h diff --git a/src/gpuwattch/xmlParser.cc b/src/accelwattch/xmlParser.cc similarity index 100% rename from src/gpuwattch/xmlParser.cc rename to src/accelwattch/xmlParser.cc diff --git a/src/gpuwattch/xmlParser.h b/src/accelwattch/xmlParser.h similarity index 100% rename from src/gpuwattch/xmlParser.h rename to src/accelwattch/xmlParser.h diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 71f0703ac..f9e5db314 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan, Jimmy Kwa -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, +// George L. Yuan, Jimmy Kwa, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -588,65 +589,119 @@ void ptx_instruction::set_fp_or_int_archop() { oprnd_type = INT_OP; } } -void ptx_instruction::set_mul_div_or_other_archop() { - sp_op = OTHER_OP; - if ((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && - (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && - (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) && - (m_opcode != CALL_OP)) { - if (get_type() == F32_TYPE || get_type() == F64_TYPE || - get_type() == FF64_TYPE) { - switch (get_opcode()) { - case MUL_OP: - case MAD_OP: - sp_op = FP_MUL_OP; - break; - case DIV_OP: - sp_op = FP_DIV_OP; - break; - case LG2_OP: - sp_op = FP_LG_OP; - break; - case RSQRT_OP: - case SQRT_OP: - sp_op = FP_SQRT_OP; - break; - case RCP_OP: - sp_op = FP_DIV_OP; - break; - case SIN_OP: - case COS_OP: - sp_op = FP_SIN_OP; - break; - case EX2_OP: - sp_op = FP_EXP_OP; - break; - default: - if ((op == ALU_OP) || (op == TENSOR_CORE_OP)) sp_op = FP__OP; - break; + +void ptx_instruction::set_mul_div_or_other_archop(){ + sp_op=OTHER_OP; + if((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) && (m_opcode != CALL_OP)){ + if(get_type() == F64_TYPE || get_type() == FF64_TYPE){ + switch(get_opcode()){ + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op=DP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op=DP_DIV_OP; + break; + case RCP_OP: + sp_op=DP_DIV_OP; + break; + case LG2_OP: + sp_op=FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op=FP_SQRT_OP; + break; + case SIN_OP: + case COS_OP: + sp_op=FP_SIN_OP; + break; + case EX2_OP: + sp_op=FP_EXP_OP; + break; + case MMA_OP: + sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==DP_OP) || (op==ALU_OP)) + sp_op=DP___OP; + break; + } } - } else { - switch (get_opcode()) { - case MUL24_OP: - case MAD24_OP: - sp_op = INT_MUL24_OP; - break; - case MUL_OP: - case MAD_OP: - if (get_type() == U32_TYPE || get_type() == S32_TYPE || - get_type() == B32_TYPE) - sp_op = INT_MUL32_OP; - else - sp_op = INT_MUL_OP; - break; - case DIV_OP: - sp_op = INT_DIV_OP; - break; - default: - if ((op == ALU_OP)) sp_op = INT__OP; - break; + else if(get_type()==F16_TYPE || get_type()==F32_TYPE){ + switch(get_opcode()){ + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op=FP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op=FP_DIV_OP; + break; + case RCP_OP: + sp_op=FP_DIV_OP; + break; + case LG2_OP: + sp_op=FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op=FP_SQRT_OP; + break; + case SIN_OP: + case COS_OP: + sp_op=FP_SIN_OP; + break; + case EX2_OP: + sp_op=FP_EXP_OP; + break; + case MMA_OP: + 
sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==SP_OP) || (op==ALU_OP)) + sp_op=FP__OP; + break; + } + }else { + switch(get_opcode()){ + case MUL24_OP: + case MAD24_OP: + sp_op=INT_MUL24_OP; + break; + case MUL_OP: + case MAD_OP: + case FMA_OP: + if(get_type()==U32_TYPE || get_type()==S32_TYPE || get_type()==B32_TYPE) + sp_op=INT_MUL32_OP; + else + sp_op=INT_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op=INT_DIV_OP; + break; + case MMA_OP: + sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==INTP_OP) || (op==ALU_OP)) + sp_op=INT__OP; + break; + } } - } } } @@ -880,6 +935,7 @@ void ptx_instruction::set_opcode_and_latency() { case MAD_OP: case MADC_OP: case MADP_OP: + case FMA_OP: // MAD latency switch (get_type()) { case F32_TYPE: @@ -903,7 +959,18 @@ void ptx_instruction::set_opcode_and_latency() { break; } break; + case MUL24_OP: //MUL24 is performed on mul32 units (with additional instructions for bitmasking) on devices with compute capability >1.x + latency = int_latency[2]+1; + initiation_interval = int_init[2]+1; + op = INTP_OP; + break; + case MAD24_OP: + latency = int_latency[3]+1; + initiation_interval = int_init[3]+1; + op = INTP_OP; + break; case DIV_OP: + case REM_OP: // Floating point only op = SFU_OP; switch (get_type()) { diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 8936fa80e..44afbe5aa 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Jimmy Kwa, George L. Yuan -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// Jimmy Kwa, George L. Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
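For orientation, the rewritten ptx_instruction::set_mul_div_or_other_archop() above classifies each PTX instruction into an AccelWattch power-model operation class from its data type and opcode: double-precision multiply/MAD/FMA maps to DP_MUL_OP, single- and half-precision to FP_MUL_OP, 32-bit integer to INT_MUL32_OP, div/rem to the corresponding divide classes, and MMA to the tensor class. The sketch below is a simplified, self-contained restatement of that decision structure; the enum names, the classify() helper, and the reduced type list are illustrative stand-ins, not the simulator's actual definitions.

// Illustrative sketch only: simplified stand-ins for the simulator's
// type/opcode enums and the classification done in set_mul_div_or_other_archop().
#include <cstdio>

enum toy_type { F64, F32, F16, S32, U32 };           // subset of PTX types
enum toy_opcode { MUL, MAD, FMA, DIV, REM, MMA };     // subset of opcodes
enum toy_sp_op { OTHER_OP, DP_MUL_OP, DP_DIV_OP, FP_MUL_OP, FP_DIV_OP,
                 INT_MUL32_OP, INT_DIV_OP, TENSOR_OP };

// Hypothetical helper mirroring the nested type/opcode switches above.
toy_sp_op classify(toy_type t, toy_opcode op) {
  if (op == MMA) return TENSOR_OP;                    // tensor-core ops
  bool fp64 = (t == F64);
  bool fp32 = (t == F32 || t == F16);
  switch (op) {
    case MUL: case MAD: case FMA:
      return fp64 ? DP_MUL_OP : fp32 ? FP_MUL_OP : INT_MUL32_OP;
    case DIV: case REM:
      return fp64 ? DP_DIV_OP : fp32 ? FP_DIV_OP : INT_DIV_OP;
    default:
      return OTHER_OP;
  }
}

int main() {
  std::printf("fma.f64 -> %d (DP_MUL_OP)\n", classify(F64, FMA));
  std::printf("div.s32 -> %d (INT_DIV_OP)\n", classify(S32, DIV));
  return 0;
}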
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. + #include "instructions.h" #include "half.h" #include "half.hpp" @@ -166,8 +168,9 @@ void inst_not_implemented(const ptx_instruction *pI); ptx_reg_t srcOperandModifiers(ptx_reg_t opData, operand_info opInfo, operand_info dstInfo, unsigned type, ptx_thread_info *thread); - -void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, int op_code); + +void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, + int op_code); void sign_extend(ptx_reg_t &data, unsigned src_size, const operand_info &dst); @@ -1711,40 +1714,50 @@ void bfi_impl(const ptx_instruction *pI, ptx_thread_info *thread) { } thread->set_operand_value(dst, data, i_type, thread, pI); } -void bfind_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ - const operand_info &dst = pI->dst(); +void bfind_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + const operand_info &dst = pI->dst(); const operand_info &src1 = pI->src1(); const unsigned i_type = pI->get_type(); - const ptx_reg_t src1_data = thread->get_operand_value(src1, dst, i_type, thread, 1); - const int msb = ( i_type == U32_TYPE || i_type == S32_TYPE) ? 31 : 63; + const ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, i_type, thread, 1); + const int msb = (i_type == U32_TYPE || i_type == S32_TYPE) ? 31 : 63; unsigned long a = 0; - switch (i_type) - { - case S32_TYPE: a = src1_data.s32; break; - case U32_TYPE: a = src1_data.u32; break; - case S64_TYPE: a = src1_data.s64; break; - case U64_TYPE: a = src1_data.u64; break; - default: assert(false); abort(); + switch (i_type) { + case S32_TYPE: + a = src1_data.s32; + break; + case U32_TYPE: + a = src1_data.u32; + break; + case S64_TYPE: + a = src1_data.s64; + break; + case U64_TYPE: + a = src1_data.u64; + break; + default: + assert(false); + abort(); } // negate negative signed inputs - if ( ( i_type == S32_TYPE || i_type == S64_TYPE ) && ( a & ( 1 << msb ) ) ) { - a = ~a; + if ((i_type == S32_TYPE || i_type == S64_TYPE) && (a & (1 << msb))) { + a = ~a; } uint32_t d_data = 0xffffffff; for (uint32_t i = msb; i >= 0; i--) { - if (a & (1<set_operand_value(dst, d_data, U32_TYPE, thread, pI); - - } void bra_impl(const ptx_instruction *pI, ptx_thread_info *thread) { @@ -3966,7 +3979,7 @@ void mad_def(const ptx_instruction *pI, ptx_thread_info *thread, fesetround(FE_TOWARDZERO); break; default: - assert(0); + //assert(0); break; } d.f32 = a.f32 * b.f32 + c.f32; @@ -4312,11 +4325,8 @@ void mul_impl(const ptx_instruction *pI, ptx_thread_info *thread) { case S64_TYPE: t.s64 = a.s64 * b.s64; assert(!pI->is_wide()); - assert(!pI->is_hi()); - if (pI->is_lo()) - d.s64 = t.s64; - else - assert(0); + //assert(!pI->is_hi()); + d.s64 = t.s64; break; case U16_TYPE: t.u32 = ((unsigned)a.u16) * ((unsigned)b.u16); @@ -6339,12 +6349,10 @@ void vmad_impl(const ptx_instruction *pI, ptx_thread_info *thread) { #define VMAX 0 #define VMIN 1 -void vmax_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ - video_mem_instruction(pI, thread, VMAX); +void vmax_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + video_mem_instruction(pI, thread, VMAX); } -void vmin_impl(const ptx_instruction *pI, ptx_thread_info 
*thread) -{ +void vmin_impl(const ptx_instruction *pI, ptx_thread_info *thread) { video_mem_instruction(pI, thread, VMIN); } void vset_impl(const ptx_instruction *pI, ptx_thread_info *thread) { @@ -6440,12 +6448,12 @@ void vote_impl(const ptx_instruction *pI, ptx_thread_info *thread) { } } -void activemask_impl( const ptx_instruction *pI, ptx_thread_info *thread ) -{ +void activemask_impl(const ptx_instruction *pI, ptx_thread_info *thread) { active_mask_t l_activemask_bitset = pI->get_warp_active_mask(); - uint32_t l_activemask_uint = static_cast(l_activemask_bitset.to_ulong()); + uint32_t l_activemask_uint = + static_cast(l_activemask_bitset.to_ulong()); - const operand_info &dst = pI->dst(); + const operand_info &dst = pI->dst(); thread->set_operand_value(dst, l_activemask_uint, U32_TYPE, thread, pI); } @@ -6527,12 +6535,12 @@ ptx_reg_t srcOperandModifiers(ptx_reg_t opData, operand_info opInfo, return result; } -void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, int op_code) -{ - const operand_info &dst = pI->dst(); // d - const operand_info &src1 = pI->src1(); // a - const operand_info &src2 = pI->src2(); // b - const operand_info &src3 = pI->src3(); // c +void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, + int op_code) { + const operand_info &dst = pI->dst(); // d + const operand_info &src1 = pI->src1(); // a + const operand_info &src2 = pI->src2(); // b + const operand_info &src3 = pI->src3(); // c const unsigned i_type = pI->get_type(); @@ -6557,19 +6565,18 @@ void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, i auto option = options.begin(); assert(*option == ATOMIC_MAX || *option == ATOMIC_MIN); - switch ( i_type ) { + switch (i_type) { case S32_TYPE: { // assert all operands are S32_TYPE: scalar_type = pI->get_scalar_type(); - for (std::list::iterator scalar = scalar_type.begin(); scalar != scalar_type.end(); scalar++) - { + for (std::list::iterator scalar = scalar_type.begin(); + scalar != scalar_type.end(); scalar++) { assert(*scalar == S32_TYPE); } assert(scalar_type.size() == 3); scalar_type.clear(); - switch (op_code) - { + switch (op_code) { case VMAX: data.s32 = MY_MAX_I(ta.s32, tb.s32); break; @@ -6580,26 +6587,23 @@ void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, i assert(0); } - switch (*option) - { + switch (*option) { case ATOMIC_MAX: data.s32 = MY_MAX_I(data.s32, c.s32); - break; + break; case ATOMIC_MIN: data.s32 = MY_MIN_I(data.s32, c.s32); - break; + break; default: - assert(0); // not yet implemented + assert(0); // not yet implemented } break; - } default: - assert(0); // not yet implemented + assert(0); // not yet implemented } thread->set_operand_value(dst, data, i_type, thread, pI); return; } - diff --git a/src/cuda-sim/ptx.l b/src/cuda-sim/ptx.l index 675404597..7706f0b31 100644 --- a/src/cuda-sim/ptx.l +++ b/src/cuda-sim/ptx.l @@ -1,32 +1,34 @@ /* -Copyright (c) 2009-2011, Tor M. Aamodt -The University of British Columbia +Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +The University of British Columbia, Northwestern University All rights reserved. - Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. 
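The bfind_impl hunk in instructions.cc above implements PTX bfind: it returns the bit position of the most significant non-sign bit of the source operand, after bitwise-negating negative signed inputs, and 0xffffffff when no such bit exists. A minimal sketch of that semantic for the 32-bit signed case, independent of the simulator's operand plumbing (the bfind_s32 name is made up for illustration):

// Sketch of the bfind semantics used by bfind_impl (32-bit signed case).
#include <cstdint>
#include <cstdio>

uint32_t bfind_s32(int32_t v) {
  uint32_t a = static_cast<uint32_t>(v);
  if (v < 0) a = ~a;                 // negate negative signed inputs
  uint32_t d = 0xffffffffu;          // "not found" result
  for (int i = 31; i >= 0; --i) {
    if (a & (1u << i)) { d = static_cast<uint32_t>(i); break; }
  }
  return d;
}

int main() {
  std::printf("bfind(0x00000010) = %u\n", bfind_s32(0x10));  // 4
  std::printf("bfind(-1)         = %u\n", bfind_s32(-1));    // ~(-1) == 0, so 0xffffffff
  std::printf("bfind(0)          = %u\n", bfind_s32(0));     // 0xffffffff
  return 0;
}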
-Redistributions in binary form must reproduce the above copyright notice, this -list of conditions and the following disclaimer in the documentation and/or -other materials provided with the distribution. -Neither the name of The University of British Columbia nor the names of its -contributors may be used to endorse or promote products derived from this -software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer; +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution; +3. Neither the names of The University of British Columbia, Northwestern + University nor the names of their contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
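Similarly, activemask_impl above packs the issuing warp's active-thread mask into a 32-bit unsigned value and writes it to the destination register. A tiny sketch of that packing, assuming a 32-wide std::bitset as a stand-in for the simulator's active_mask_t:

// Sketch of the mask packing performed by activemask_impl.
#include <bitset>
#include <cstdint>
#include <cstdio>

int main() {
  std::bitset<32> active_mask;           // stand-in for active_mask_t
  active_mask.set(0); active_mask.set(1); active_mask.set(5);
  uint32_t packed = static_cast<uint32_t>(active_mask.to_ulong());
  std::printf("activemask -> 0x%08x\n", packed);  // 0x00000023
  return 0;
}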
*/ + %option nounput %option noyywrap %option yylineno @@ -69,6 +71,7 @@ andn TC; yylval->int_value = ANDN_OP; return OPCODE; atom TC; yylval->int_value = ATOM_OP; return OPCODE; bar.warp TC; yylval->int_value = NOP_OP; return OPCODE; bar TC; yylval->int_value = BAR_OP; return OPCODE; +barrier TC; yylval->int_value = BAR_OP; return OPCODE; bfe TC; yylval->int_value = BFE_OP; return OPCODE; bfi TC; yylval->int_value = BFI_OP; return OPCODE; bfind TC; yylval->int_value = BFIND_OP; return OPCODE; @@ -167,14 +170,22 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE; "CPTX_END" printf("ENDING CUSTOM PTX.\n"); BEGIN(IN_COMMENT); { -\.a\.sync TC; yylval->int_value = LOAD_A; return WMMA_DIRECTIVE; -\.b\.sync TC; yylval->int_value = LOAD_B; return WMMA_DIRECTIVE; -\.c\.sync TC; yylval->int_value = LOAD_C; return WMMA_DIRECTIVE; -\.d\.sync TC; yylval->int_value = STORE_D; return WMMA_DIRECTIVE; -\.mma\.sync TC;yylval->int_value=MMA; return WMMA_DIRECTIVE; +\.a\.sync\.aligned TC; yylval->int_value = LOAD_A; return WMMA_DIRECTIVE; +\.b\.sync\.aligned TC; yylval->int_value = LOAD_B; return WMMA_DIRECTIVE; +\.c\.sync\.aligned TC; yylval->int_value = LOAD_C; return WMMA_DIRECTIVE; +\.d\.sync\.aligned TC; yylval->int_value = STORE_D; return WMMA_DIRECTIVE; +\.mma\.sync\.aligned TC;yylval->int_value=MMA; return WMMA_DIRECTIVE; \.row TC; yylval->int_value = ROW; return LAYOUT; \.col TC; yylval->int_value = COL; return LAYOUT; +\.m16n16k16\.global TC; yylval->int_value = M16N16K16; return CONFIGURATION; +\.m32n8k16\.global TC; yylval->int_value = M32N8K16; return CONFIGURATION; +\.m8n32k16\.global TC; yylval->int_value = M8N32K16; return CONFIGURATION; + +\.m16n16k16\.shared TC; yylval->int_value = M16N16K16; return CONFIGURATION; +\.m32n8k16\.shared TC; yylval->int_value = M32N8K16; return CONFIGURATION; +\.m8n32k16\.shared TC; yylval->int_value = M8N32K16; return CONFIGURATION; + \.m16n16k16 TC; yylval->int_value = M16N16K16; return CONFIGURATION; \.m32n8k16 TC; yylval->int_value = M32N8K16; return CONFIGURATION; \.m8n32k16 TC; yylval->int_value = M8N32K16; return CONFIGURATION; @@ -476,4 +487,4 @@ int ptx_error( yyscan_t yyscanner, ptx_recognizer* recognizer, const char *s ) fflush(stdout); //exit(1); return 0; -} +} \ No newline at end of file diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index e5b5fb773..2edc1ed56 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, +// George L. Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -1147,8 +1148,8 @@ static std::list check_operands( const std::list &operands, gpgpu_context *ctx) { static int g_warn_literal_operands_two_type_inst; if ((opcode == CVT_OP) || (opcode == SET_OP) || (opcode == SLCT_OP) || - (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) || - (opcode == VMIN_OP) || (opcode == VMAX_OP) ) { + (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) || + (opcode == VMIN_OP) || (opcode == VMAX_OP)) { // just make sure these do not have have const operands... if (!g_warn_literal_operands_two_type_inst) { std::list::const_iterator o; @@ -1384,6 +1385,8 @@ ptx_instruction::ptx_instruction( case CS_OPTION: case LU_OPTION: case CV_OPTION: + case WB_OPTION: + case WT_OPTION: m_cache_option = last_ptx_inst_option; break; case HALF_OPTION: diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index 42439412c..825175964 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -966,8 +966,8 @@ class ptx_instruction : public warp_inst_t { int get_pred_mod() const { return m_pred_mod; } const char *get_source() const { return m_source.c_str(); } - const std::list get_scalar_type() const {return m_scalar_type;} - const std::list get_options() const {return m_options;} + const std::list get_scalar_type() const { return m_scalar_type; } + const std::list get_options() const { return m_options; } typedef std::vector::const_iterator const_iterator; diff --git a/src/cuda-sim/ptx_loader.cc b/src/cuda-sim/ptx_loader.cc index 4e91763e8..fa304b316 100644 --- a/src/cuda-sim/ptx_loader.cc +++ b/src/cuda-sim/ptx_loader.cc @@ -354,7 +354,7 @@ void gpgpu_context::gpgpu_ptx_info_load_from_filename(const char *filename, snprintf(extra_flags, 1024, "--compile-only --gpu-name=sm_%u", sm_version); snprintf( buff, 1024, - "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file /dev/null 2> %s", + "$CUDA_INSTALL_PATH/bin/ptxas -w %s -v %s --output-file /dev/null 2> %s", extra_flags, filename, ptxas_filename.c_str()); int result = system(buff); if (result != 0) { @@ -441,7 +441,7 @@ void gpgpu_context::gpgpu_ptxinfo_load_from_string(const char *p_for_info, #endif snprintf(commandline, 1024, - "$PTXAS_CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file " + "$PTXAS_CUDA_INSTALL_PATH/bin/ptxas -w %s -v %s --output-file " "/dev/null 2> %s", extra_flags, fname2, tempfile_ptxinfo); printf("GPGPU-Sim PTX: generating ptxinfo using \"%s\"\n", commandline); @@ -460,7 +460,7 @@ void gpgpu_context::gpgpu_ptxinfo_load_from_string(const char *p_for_info, fix_duplicate_errors(fname2); snprintf(commandline, 1024, - "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file " + "$CUDA_INSTALL_PATH/bin/ptxas -w %s -v %s --output-file " "/dev/null 2> %s", extra_flags, fname2, tempfile_ptxinfo); printf("GPGPU-Sim PTX: regenerating ptxinfo using 
\"%s\"\n", @@ -524,7 +524,7 @@ void gpgpu_context::gpgpu_ptxinfo_load_from_string(const char *p_for_info, snprintf( commandline, 1024, - "$CUDA_INSTALL_PATH/bin/ptxas %s -v %s --output-file /dev/null 2> %s", + "$CUDA_INSTALL_PATH/bin/ptxas -w %s -v %s --output-file /dev/null 2> %s", extra_flags, fname2, tempfile_ptxinfo); printf("GPGPU-Sim PTX: generating ptxinfo using \"%s\"\n", commandline); fflush(stdout); diff --git a/src/cuda-sim/ptx_parser.cc b/src/cuda-sim/ptx_parser.cc index afdb41ba8..86a33c2d3 100644 --- a/src/cuda-sim/ptx_parser.cc +++ b/src/cuda-sim/ptx_parser.cc @@ -622,13 +622,13 @@ void ptx_recognizer::add_scalar_type_spec(int type_spec) { g_ptx_token_decode[type_spec].c_str()); g_scalar_type.push_back(type_spec); if (g_scalar_type.size() > 1) { - parse_assert((g_opcode == -1) || (g_opcode == CVT_OP) || - (g_opcode == SET_OP) || (g_opcode == SLCT_OP) || - (g_opcode == TEX_OP) || (g_opcode == MMA_OP) || - (g_opcode == DP4A_OP) || (g_opcode == VMIN_OP) || - (g_opcode == VMAX_OP), - "only cvt, set, slct, tex, vmin, vmax and dp4a can have more than one " - "type specifier."); + parse_assert( + (g_opcode == -1) || (g_opcode == CVT_OP) || (g_opcode == SET_OP) || + (g_opcode == SLCT_OP) || (g_opcode == TEX_OP) || + (g_opcode == MMA_OP) || (g_opcode == DP4A_OP) || + (g_opcode == VMIN_OP) || (g_opcode == VMAX_OP), + "only cvt, set, slct, tex, vmin, vmax and dp4a can have more than one " + "type specifier."); } g_scalar_type_spec = type_spec; } diff --git a/src/gpgpu-sim/delayqueue.h b/src/gpgpu-sim/delayqueue.h index 1cf418529..f1ad66073 100644 --- a/src/gpgpu-sim/delayqueue.h +++ b/src/gpgpu-sim/delayqueue.h @@ -154,6 +154,7 @@ class fifo_pipeline { } bool full() const { return (m_max_len && m_length >= m_max_len); } + bool full(int n) const { return (m_max_len && m_length + n > m_max_len); } bool is_avilable_size(unsigned size) const { return (m_max_len && m_length + size - 1 >= m_max_len); } diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index ca47c4684..6360c6726 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Ivan Sham, George L. Yuan, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// Ivan Sham, George L. Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. 
Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -263,7 +264,8 @@ void dram_t::push(class mem_fetch *data) { max_mrqs_temp = (max_mrqs_temp > mrqq->get_length()) ? max_mrqs_temp : mrqq->get_length(); } - m_stats->memlatstat_dram_access(data); + if (data->get_sid() < 80) + m_stats->memlatstat_dram_access(data); } void dram_t::scheduler_fifo() { @@ -855,7 +857,7 @@ void dram_t::visualizer_print(gzFile visualizer_file) { void dram_t::set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop, unsigned &act, unsigned &pre, - unsigned &rd, unsigned &wr, + unsigned &rd, unsigned &wr, unsigned &wr_WB, unsigned &req) const { // Point power performance counters to low-level DRAM counters cmd = n_cmd; @@ -865,6 +867,7 @@ void dram_t::set_dram_power_stats(unsigned &cmd, unsigned &activity, pre = n_pre; rd = n_rd; wr = n_wr; + wr_WB = n_wr_WB; req = n_req; } diff --git a/src/gpgpu-sim/dram.h b/src/gpgpu-sim/dram.h index 6c212e9be..88e46ed7b 100644 --- a/src/gpgpu-sim/dram.h +++ b/src/gpgpu-sim/dram.h @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, -// George L. Yuan, Wilson W.L. Fung -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, +// George L. Yuan, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -135,7 +136,7 @@ class dram_t { // Power Model void set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop, unsigned &act, unsigned &pre, unsigned &rd, - unsigned &wr, unsigned &req) const; + unsigned &wr, unsigned &wr_WB, unsigned &req) const; const memory_config *m_config; diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 75c369136..62966155b 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
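The dram.cc/dram.h hunks above add a write-back write counter (n_wr_WB, surfaced through the new wr_WB output of set_dram_power_stats) alongside the existing counters, presumably so the power model can account for write-back traffic separately. Below is a toy mirror of that extended interface and how a caller would read it; the struct and the counter values are illustrative only.

// Toy mirror of the DRAM counters that set_dram_power_stats() now exports,
// with the new wr_WB output reported alongside the existing wr counter.
// Field names follow the diff; the struct itself is illustrative only.
#include <cstdio>

struct toy_dram_counters {
  unsigned n_cmd = 0, n_activity = 0, n_nop = 0, n_act = 0, n_pre = 0;
  unsigned n_rd = 0, n_wr = 0, n_wr_WB = 0, n_req = 0;

  void set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop,
                            unsigned &act, unsigned &pre, unsigned &rd,
                            unsigned &wr, unsigned &wr_WB, unsigned &req) const {
    cmd = n_cmd; activity = n_activity; nop = n_nop; act = n_act; pre = n_pre;
    rd = n_rd; wr = n_wr; wr_WB = n_wr_WB; req = n_req;
  }
};

int main() {
  toy_dram_counters d;
  d.n_rd = 100; d.n_wr = 40; d.n_wr_WB = 25;   // illustrative counter values
  unsigned cmd, activity, nop, act, pre, rd, wr, wr_WB, req;
  d.set_dram_power_stats(cmd, activity, nop, act, pre, rd, wr, wr_WB, req);
  std::printf("reads=%u writes=%u write-back writes=%u\n", rd, wr, wr_WB);
  return 0;
}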
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -37,7 +38,8 @@ const char *cache_request_status_str(enum cache_request_status status) { static const char *static_cache_request_status_str[] = { - "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", "SECTOR_MISS"}; + "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", + "SECTOR_MISS", "MSHR_HIT"}; assert(sizeof(static_cache_request_status_str) / sizeof(const char *) == NUM_CACHE_REQUEST_STATUS); @@ -63,9 +65,9 @@ unsigned l1d_cache_config::set_bank(new_addr_type addr) const { // For sector cache, we select one sector per bank (sector interleaving) // This is what was found in Volta (one sector per bank, sector interleaving) // otherwise, line interleaving - return cache_config::hash_function(addr, l1_banks, l1_banks_byte_interleaving, - m_l1_banks_log2, - l1_banks_hashing_function); + return cache_config::hash_function(addr, l1_banks, + l1_banks_byte_interleaving_log2, + l1_banks_log2, l1_banks_hashing_function); } unsigned cache_config::set_index(new_addr_type addr) const { @@ -210,6 +212,7 @@ void tag_array::init(int core_id, int type_id) { m_core_id = core_id; m_type_id = type_id; is_used = false; + m_dirty = 0; } void tag_array::add_pending_line(mem_fetch *mf) { @@ -231,15 +234,15 @@ void tag_array::remove_pending_line(mem_fetch *mf) { } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, - mem_fetch *mf, + mem_fetch *mf, bool is_write, bool probe_mode) const { mem_access_sector_mask_t mask = mf->get_access_sector_mask(); - return probe(addr, idx, mask, probe_mode, mf); + return probe(addr, idx, mask, is_write, probe_mode, mf); } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, mem_access_sector_mask_t mask, - bool probe_mode, + bool is_write, bool probe_mode, mem_fetch *mf) const { // assert( m_config.m_write_policy == READ_ONLY ); unsigned set_index = m_config.set_index(addr); @@ -250,7 +253,6 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, unsigned long long valid_timestamp = (unsigned)-1; bool all_reserved = true; - // check for hit or pending hit for (unsigned way = 0; way < m_config.m_assoc; way++) { unsigned index = set_index * m_config.m_assoc + way; @@ -263,12 +265,15 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, idx = index; return HIT; } else if (line->get_status(mask) == MODIFIED) { - if (line->is_readable(mask)) { + if ((!is_write && line->is_readable(mask)) || is_write) { idx = index; return HIT; } else { idx = index; - return SECTOR_MISS; + if (m_config.m_cache_type == SECTOR) + return SECTOR_MISS; + else + return MISS; } } else if (line->is_valid_line() && line->get_status(mask) == INVALID) { @@ -279,20 +284,31 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, } } if (!line->is_reserved_line()) { - all_reserved = false; - if (line->is_invalid_line()) { - invalid_line = index; - } else { - // valid line : keep track of most appropriate replacement candidate - if (m_config.m_replacement_policy == LRU) { - if (line->get_last_access_time() < valid_timestamp) { - valid_timestamp = line->get_last_access_time(); - valid_line = index; - } - } else if (m_config.m_replacement_policy == FIFO) { - if (line->get_alloc_time() < valid_timestamp) { - valid_timestamp = line->get_alloc_time(); - valid_line = index; + // percentage of dirty lines in the cache + // number of dirty 
lines / total lines in the cache + float dirty_line_percentage = + ((float)m_dirty / (m_config.m_nset * m_config.m_assoc)) * 100; + // If the cacheline is from a load op (not modified), + // or the total dirty cacheline is above a specific value, + // Then this cacheline is eligible to be considered for replacement candidate + // i.e. Only evict clean cachelines until total dirty cachelines reach the limit. + if (!line->is_modified_line() || + dirty_line_percentage >= m_config.m_wr_percent) { + all_reserved = false; + if (line->is_invalid_line()) { + invalid_line = index; + } else { + // valid line : keep track of most appropriate replacement candidate + if (m_config.m_replacement_policy == LRU) { + if (line->get_last_access_time() < valid_timestamp) { + valid_timestamp = line->get_last_access_time(); + valid_line = index; + } + } else if (m_config.m_replacement_policy == FIFO) { + if (line->get_alloc_time() < valid_timestamp) { + valid_timestamp = line->get_alloc_time(); + valid_line = index; + } } } } @@ -312,15 +328,6 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, abort(); // if an unreserved block exists, it is either invalid or // replaceable - if (probe_mode && m_config.is_streaming()) { - line_table::const_iterator i = - pending_lines.find(m_config.block_addr(addr)); - assert(mf); - if (!mf->is_write() && i != pending_lines.end()) { - if (i->second != mf->get_inst().get_uid()) return SECTOR_MISS; - } - } - return MISS; } @@ -328,7 +335,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, unsigned &idx, mem_fetch *mf) { bool wb = false; evicted_block_info evicted; - enum cache_request_status result = access(addr, time, idx, wb, evicted, mf); + enum cache_request_status result = access(addr, time, idx, wb, evicted, mf, false); assert(!wb); return result; } @@ -336,11 +343,13 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, unsigned &idx, bool &wb, evicted_block_info &evicted, - mem_fetch *mf) { + mem_fetch *mf, bool mshr_hit_avail) { m_access++; is_used = true; shader_cache_access_log(m_core_id, m_type_id, 0); // log accesses to cache - enum cache_request_status status = probe(addr, idx, mf); + enum cache_request_status status = probe(addr, idx, mf, mf->is_write()); + if (mshr_hit_avail && status == MISS && !mf->get_is_write()) + status = HIT_RESERVED; switch (status) { case HIT_RESERVED: m_pending_hit++; @@ -353,8 +362,12 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, if (m_config.m_alloc_policy == ON_MISS) { if (m_lines[idx]->is_modified_line()) { wb = true; + // m_lines[idx]->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, - m_lines[idx]->get_modified_size()); + m_lines[idx]->get_modified_size(), + m_lines[idx]->get_dirty_byte_mask(), + m_lines[idx]->get_dirty_sector_mask()); + m_dirty--; } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mf->get_access_sector_mask()); @@ -365,8 +378,12 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_sector_miss++; shader_cache_access_log(m_core_id, m_type_id, 1); // log cache misses if (m_config.m_alloc_policy == ON_MISS) { + bool before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx]) ->allocate_sector(time, mf->get_access_sector_mask()); + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } } break; case 
RESERVATION_FAIL: @@ -383,31 +400,45 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, return status; } -void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf) { - fill(addr, time, mf->get_access_sector_mask()); +void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf, + bool is_write) { + fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask(), + is_write); } void tag_array::fill(new_addr_type addr, unsigned time, - mem_access_sector_mask_t mask) { + mem_access_sector_mask_t mask, + mem_access_byte_mask_t byte_mask, bool is_write) { // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; - enum cache_request_status status = probe(addr, idx, mask); + enum cache_request_status status = probe(addr, idx, mask, is_write); + bool before = m_lines[idx]->is_modified_line(); // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request - if (status == MISS) + if (status == MISS) { m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mask); - else if (status == SECTOR_MISS) { + } else if (status == SECTOR_MISS) { assert(m_config.m_cache_type == SECTOR); ((sector_cache_block *)m_lines[idx])->allocate_sector(time, mask); } - - m_lines[idx]->fill(time, mask); + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } + before = m_lines[idx]->is_modified_line(); + m_lines[idx]->fill(time, mask, byte_mask); + if (m_lines[idx]->is_modified_line() && !before) { + m_dirty++; + } } void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - m_lines[index]->fill(time, mf->get_access_sector_mask()); + bool before = m_lines[index]->is_modified_line(); + m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); + if (m_lines[index]->is_modified_line() && !before) { + m_dirty++; + } } // TODO: we need write back the flushed data to the upper level @@ -416,10 +447,12 @@ void tag_array::flush() { for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { - for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) + for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); + } } + m_dirty = 0; is_used = false; } @@ -430,6 +463,7 @@ void tag_array::invalidate() { for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); + m_dirty = 0; is_used = false; } @@ -485,8 +519,10 @@ bool was_writeback_sent(const std::list &events, cache_event &wb_event) { for (std::list::const_iterator e = events.begin(); e != events.end(); e++) { - if ((*e).m_cache_event_type == WRITE_BACK_REQUEST_SENT) wb_event = *e; - return true; + if ((*e).m_cache_event_type == WRITE_BACK_REQUEST_SENT) { + wb_event = *e; + return true; + } } return false; } @@ -566,6 +602,7 @@ mem_fetch *mshr_table::next_access() { new_addr_type block_addr = m_current_response.front(); assert(!m_data[block_addr].m_list.empty()); mem_fetch *result = m_data[block_addr].m_list.front(); + // printf("cache fill response: data size: %d\taccess size:%d\n", result->get_data_size(), result->get_access_size()); m_data[block_addr].m_list.pop_front(); if (m_data[block_addr].m_list.empty()) { // release entry @@ -612,6 +649,7 @@ void cache_stats::clear() { /// for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { std::fill(m_stats[i].begin(), m_stats[i].end(), 0); + 
std::fill(m_stats_pw[i].begin(), m_stats_pw[i].end(), 0); std::fill(m_fail_stats[i].begin(), m_fail_stats[i].end(), 0); } m_cache_port_available_cycles = 0; @@ -771,7 +809,9 @@ void cache_stats::print_stats(FILE *fout, const char *cache_name) const { cache_request_status_str((enum cache_request_status)status), m_stats[type][status]); - if (status != RESERVATION_FAIL) + if (status != RESERVATION_FAIL && status != MSHR_HIT) + // MSHR_HIT is a special type of SECTOR_MISS + // so its already included in the SECTOR_MISS total_access[type] += m_stats[type][status]; } } @@ -1017,6 +1057,7 @@ bool baseline_cache::bandwidth_management::fill_port_free() const { void baseline_cache::cycle() { if (!m_miss_queue.empty()) { mem_fetch *mf = m_miss_queue.front(); + // printf("%s cache cycle: data size: %d\taccess size:%d\n", m_name.c_str(), mf->get_data_size(), mf->get_access_size()); if (!m_memport->full(mf->size(), mf->get_is_write())) { m_miss_queue.pop_front(); m_memport->push(mf); @@ -1031,6 +1072,7 @@ void baseline_cache::cycle() { /// Interface for response from lower memory level (model bandwidth restictions /// in caller) void baseline_cache::fill(mem_fetch *mf, unsigned time) { + // printf("%s cache fill: data size: %d\taccess size:%d\taccess type:%d\n", m_name.c_str(), mf->get_data_size(), mf->get_access_size(), mf->get_access_type()); if (m_config.m_mshr_type == SECTOR_ASSOC) { assert(mf->get_original_mf()); extra_mf_fields_lookup::iterator e = @@ -1057,8 +1099,7 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { if (m_config.m_alloc_policy == ON_MISS) m_tag_array->fill(e->second.m_cache_index, time, mf); else if (m_config.m_alloc_policy == ON_FILL) { - m_tag_array->fill(e->second.m_block_addr, time, mf); - if (m_config.is_streaming()) m_tag_array->remove_pending_line(mf); + m_tag_array->fill(e->second.m_block_addr, time, mf, mf->is_write()); } else abort(); bool has_atomic = false; @@ -1066,9 +1107,13 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { if (has_atomic) { assert(m_config.m_alloc_policy == ON_MISS); cache_block_t *block = m_tag_array->get_block(e->second.m_cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); // mark line as dirty for // atomic operation + block->set_byte_mask(mf); } m_extra_mf_fields.erase(mf); m_bandwidth_management.use_fill_port(mf); @@ -1120,9 +1165,10 @@ void baseline_cache::send_read_request(new_addr_type addr, if (read_only) m_tag_array->access(block_addr, time, cache_index, mf); else - m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); + m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf, mshr_hit & mshr_avail); m_mshrs.add(mshr_addr, mf); + m_stats.inc_stats(mf->get_access_type(), MSHR_HIT); do_miss = true; } else if (!mshr_hit && mshr_avail && @@ -1130,12 +1176,9 @@ void baseline_cache::send_read_request(new_addr_type addr, if (read_only) m_tag_array->access(block_addr, time, cache_index, mf); else - m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); + m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf, mshr_hit & mshr_avail); m_mshrs.add(mshr_addr, mf); - if (m_config.is_streaming() && m_config.m_cache_type == SECTOR) { - m_tag_array->add_pending_line(mf); - } m_extra_mf_fields[mf] = extra_mf_fields( mshr_addr, mf->get_addr(), cache_index, mf->get_data_size(), m_config); mf->set_data_size(m_config.get_atom_sz()); @@ -1162,6 +1205,25 @@ void data_cache::send_write_request(mem_fetch 
*mf, cache_event request, mf->set_status(m_miss_queue_status, time); } +void data_cache::update_m_readable(mem_fetch *mf, unsigned cache_index) { + cache_block_t *block = m_tag_array->get_block(cache_index); + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (mf->get_access_sector_mask().test(i)) { + bool all_set = true; + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + // If any bit in the byte mask (within the sector) is not set, + // the sector is unreadble + if (!block->get_dirty_byte_mask().test(k)) { + all_set = false; + break; + } + } + if (all_set) + block->set_m_readable(true, mf->get_access_sector_mask()); + } + } +} + /****** Write-hit functions (Set by config file) ******/ /// Write-back hit: Mark block as modified @@ -1173,7 +1235,12 @@ cache_request_status data_cache::wr_hit_wb(new_addr_type addr, new_addr_type block_addr = m_config.block_addr(addr); m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); + update_m_readable(mf,cache_index); return HIT; } @@ -1192,7 +1259,12 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type addr, new_addr_type block_addr = m_config.block_addr(addr); m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); + update_m_readable(mf,cache_index); // generate a write-through send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); @@ -1285,6 +1357,10 @@ enum cache_request_status data_cache::wr_miss_wa_naive( mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + n_mf->set_data_type(mf->get_data_type()); + n_mf->set_chip(mf->get_tlx_addr().chip); + n_mf->set_parition(mf->get_tlx_addr().sub_partition); + assert(n_mf->get_sub_partition_id() == mf->get_sub_partition_id()); bool do_miss = false; bool wb = false; evicted_block_info evicted; @@ -1302,10 +1378,13 @@ enum cache_request_status data_cache::wr_miss_wa_naive( assert(status == MISS); // SECTOR_MISS and HIT_RESERVED should not send write back mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf + wb->set_data_type(mf->get_data_type()); wb->set_chip(mf->get_tlx_addr().chip); wb->set_parition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), @@ -1337,10 +1416,14 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( evicted_block_info evicted; cache_request_status status = - m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); + m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf, false); assert(status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + if 
(!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); if (status == HIT_RESERVED) block->set_ignore_on_fill(true, mf->get_access_sector_mask()); @@ -1349,10 +1432,13 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf + wb->set_data_type(mf->get_data_type()); wb->set_chip(mf->get_tlx_addr().chip); wb->set_parition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), @@ -1402,6 +1488,9 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( mf->get_tpc(), mf->get_mem_config(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, NULL, mf); + n_mf->set_data_type(mf->get_data_type()); + n_mf->set_chip(mf->get_tlx_addr().chip); + n_mf->set_parition(mf->get_tlx_addr().sub_partition); new_addr_type block_addr = m_config.block_addr(addr); bool do_miss = false; bool wb = false; @@ -1411,6 +1500,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( cache_block_t *block = m_tag_array->get_block(cache_index); block->set_modified_on_fill(true, mf->get_access_sector_mask()); + block->set_byte_mask_on_fill(true); events.push_back(cache_event(WRITE_ALLOCATE_SENT)); @@ -1419,10 +1509,13 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf + wb->set_data_type(mf->get_data_type()); wb->set_chip(mf->get_tlx_addr().chip); wb->set_parition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), @@ -1448,34 +1541,49 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( return RESERVATION_FAIL; // cannot handle request this cycle } + if (m_config.m_write_policy == WRITE_THROUGH) { + send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); + } + bool wb = false; evicted_block_info evicted; cache_request_status m_status = - m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); + m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf, false); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); if (m_status == HIT_RESERVED) { 
block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); + block->set_byte_mask_on_fill(true); } if (mf->get_access_byte_mask().count() == m_config.get_atom_sz()) { block->set_m_readable(true, mf->get_access_sector_mask()); } else { block->set_m_readable(false, mf->get_access_sector_mask()); + if (m_status == HIT_RESERVED) + block->set_readable_on_fill(true, mf->get_access_sector_mask()); } + update_m_readable(mf,cache_index); if (m_status != RESERVATION_FAIL) { // If evicted block is modified and not a write-through // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf + wb->set_data_type(mf->get_data_type()); wb->set_chip(mf->get_tlx_addr().chip); wb->set_parition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), @@ -1516,8 +1624,12 @@ enum cache_request_status data_cache::rd_hit_base( if (mf->isatomic()) { assert(mf->get_access_type() == GLOBAL_ACC_R); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, - mf->get_access_sector_mask()); // mark line as dirty + mf->get_access_sector_mask()); // mark line as + block->set_byte_mask(mf); } return HIT; } @@ -1548,8 +1660,11 @@ enum cache_request_status data_cache::rd_miss_base( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); + wb->set_data_type(mf->get_data_type()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1572,7 +1687,7 @@ enum cache_request_status read_only_cache::access( new_addr_type block_addr = m_config.block_addr(addr); unsigned cache_index = (unsigned)-1; enum cache_request_status status = - m_tag_array->probe(block_addr, cache_index, mf); + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write()); enum cache_request_status cache_status = RESERVATION_FAIL; if (status == HIT) { @@ -1605,6 +1720,18 @@ enum cache_request_status read_only_cache::access( //! 
A general function that takes the result of a tag_array probe // and performs the correspding functions based on the cache configuration // The access fucntion calls this function + +enum cache_request_status data_cache::probe(new_addr_type addr, mem_fetch *mf) const { + assert(mf->get_data_size() <= m_config.get_atom_sz()); + bool wr = mf->get_is_write(); + new_addr_type block_addr = m_config.block_addr(addr); + unsigned cache_index = (unsigned)-1; + enum cache_request_status probe_status = + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write(), true); + + return probe_status; +} + enum cache_request_status data_cache::process_tag_probe( bool wr, enum cache_request_status probe_status, new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time, @@ -1659,7 +1786,7 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf, new_addr_type block_addr = m_config.block_addr(addr); unsigned cache_index = (unsigned)-1; enum cache_request_status probe_status = - m_tag_array->probe(block_addr, cache_index, mf, true); + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write(), true); enum cache_request_status access_status = process_tag_probe(wr, probe_status, addr, cache_index, mf, time, events); m_stats.inc_stats(mf->get_access_type(), @@ -1688,6 +1815,23 @@ enum cache_request_status l2_cache::access(new_addr_type addr, mem_fetch *mf, return data_cache::access(addr, mf, time, events); } +enum cache_request_status l2_cache::probe(new_addr_type addr, mem_fetch *mf) const { + return data_cache::probe(addr, mf); +} + +// The l2 cache access function calls the base data_cache access +// implementation. When the L2 needs to diverge from L1, L2 specific +// changes should be made here. +enum cache_request_status meta_cache::access(new_addr_type addr, mem_fetch *mf, + unsigned time, + std::list &events) { + return data_cache::access(addr, mf, time, events); +} + +enum cache_request_status meta_cache::probe(new_addr_type addr, mem_fetch *mf) const { + return data_cache::probe(addr, mf); +} + /// Access function for tex_cache /// return values: RESERVATION_FAIL if request could not be accepted /// otherwise returns HIT_RESERVED or MISS; NOTE: *never* returns HIT diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 5c28b41f6..a0899834e 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -49,6 +50,7 @@ enum cache_request_status { MISS, RESERVATION_FAIL, SECTOR_MISS, + MSHR_HIT, NUM_CACHE_REQUEST_STATUS }; @@ -71,14 +73,26 @@ enum cache_event_type { struct evicted_block_info { new_addr_type m_block_addr; unsigned m_modified_size; + mem_access_byte_mask_t m_byte_mask; + mem_access_sector_mask_t m_sector_mask; evicted_block_info() { m_block_addr = 0; m_modified_size = 0; + m_byte_mask.reset(); + m_sector_mask.reset(); } void set_info(new_addr_type block_addr, unsigned modified_size) { m_block_addr = block_addr; m_modified_size = modified_size; } + void set_info(new_addr_type block_addr, unsigned modified_size, + mem_access_byte_mask_t byte_mask, + mem_access_sector_mask_t sector_mask) { + m_block_addr = block_addr; + m_modified_size = modified_size; + m_byte_mask = byte_mask; + m_sector_mask = sector_mask; + } }; struct cache_event { @@ -108,7 +122,8 @@ struct cache_block_t { virtual void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, mem_access_sector_mask_t sector_mask) = 0; - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask) = 0; + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) = 0; virtual bool is_invalid_line() = 0; virtual bool is_valid_line() = 0; @@ -119,7 +134,10 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_status(enum cache_block_state m_status, mem_access_sector_mask_t sector_mask) = 0; - + virtual void set_byte_mask(mem_fetch *mf) = 0; + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) = 0; + virtual mem_access_byte_mask_t get_dirty_byte_mask() = 0; + virtual mem_access_sector_mask_t get_dirty_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; virtual void set_last_access_time(unsigned long long time, mem_access_sector_mask_t sector_mask) = 0; @@ -128,6 +146,9 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_modified_on_fill(bool m_modified, mem_access_sector_mask_t sector_mask) = 0; + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) = 0; + virtual void set_byte_mask_on_fill(bool m_modified) = 0; virtual unsigned get_modified_size() = 0; virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) = 0; @@ -147,6 +168,7 @@ struct line_cache_block : public cache_block_t { m_status = INVALID; m_ignore_on_fill_status = false; m_set_modified_on_fill = false; + m_set_readable_on_fill = false; m_readable = true; } void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, @@ -159,13 +181,19 @@ struct line_cache_block : public cache_block_t { m_status = RESERVED; m_ignore_on_fill_status = false; m_set_modified_on_fill = false; + 
m_set_readable_on_fill = false; + m_set_byte_mask_on_fill = false; } - void fill(unsigned time, mem_access_sector_mask_t sector_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { // if(!m_ignore_on_fill_status) // assert( m_status == RESERVED ); m_status = m_set_modified_on_fill ? MODIFIED : VALID; + if (m_set_readable_on_fill) m_readable = true; + if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); + m_fill_time = time; } virtual bool is_invalid_line() { return m_status == INVALID; } @@ -181,6 +209,20 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_status = status; } + virtual void set_byte_mask(mem_fetch *mf) { + m_dirty_byte_mask = m_dirty_byte_mask | mf->get_access_byte_mask(); + } + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { + m_dirty_byte_mask = m_dirty_byte_mask | byte_mask; + } + virtual mem_access_byte_mask_t get_dirty_byte_mask() { + return m_dirty_byte_mask; + } + virtual mem_access_sector_mask_t get_dirty_sector_mask() { + mem_access_sector_mask_t sector_mask; + if (m_status == MODIFIED) sector_mask.set(); + return sector_mask; + } virtual unsigned long long get_last_access_time() { return m_last_access_time; } @@ -197,6 +239,13 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_set_modified_on_fill = m_modified; } + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) { + m_set_readable_on_fill = readable; + } + virtual void set_byte_mask_on_fill(bool m_modified) { + m_set_byte_mask_on_fill = m_modified; + } virtual unsigned get_modified_size() { return SECTOR_CHUNCK_SIZE * SECTOR_SIZE; // i.e. cache line size } @@ -218,7 +267,10 @@ struct line_cache_block : public cache_block_t { cache_block_state m_status; bool m_ignore_on_fill_status; bool m_set_modified_on_fill; + bool m_set_readable_on_fill; + bool m_set_byte_mask_on_fill; bool m_readable; + mem_access_byte_mask_t m_dirty_byte_mask; }; struct sector_cache_block : public cache_block_t { @@ -232,11 +284,13 @@ struct sector_cache_block : public cache_block_t { m_status[i] = INVALID; m_ignore_on_fill_status[i] = false; m_set_modified_on_fill[i] = false; + m_set_readable_on_fill[i] = false; m_readable[i] = true; } m_line_alloc_time = 0; m_line_last_access_time = 0; m_line_fill_time = 0; + m_dirty_byte_mask.reset(); } virtual void allocate(new_addr_type tag, new_addr_type block_addr, @@ -261,6 +315,8 @@ struct sector_cache_block : public cache_block_t { m_status[sidx] = RESERVED; m_ignore_on_fill_status[sidx] = false; m_set_modified_on_fill[sidx] = false; + m_set_readable_on_fill[sidx] = false; + m_set_byte_mask_on_fill = false; // set line stats m_line_alloc_time = time; // only set this for the first allocated sector @@ -283,6 +339,8 @@ struct sector_cache_block : public cache_block_t { else m_set_modified_on_fill[sidx] = false; + m_set_readable_on_fill[sidx] = false; + m_status[sidx] = RESERVED; m_ignore_on_fill_status[sidx] = false; // m_set_modified_on_fill[sidx] = false; @@ -293,14 +351,20 @@ struct sector_cache_block : public cache_block_t { m_line_fill_time = 0; } - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { unsigned sidx = get_sector_index(sector_mask); // if(!m_ignore_on_fill_status[sidx]) // assert( m_status[sidx] == RESERVED ); - m_status[sidx] = 
m_set_modified_on_fill[sidx] ? MODIFIED : VALID; + if (m_set_readable_on_fill[sidx]) { + m_readable[sidx] = true; + m_set_readable_on_fill[sidx] = false; + } + if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); + m_sector_fill_time[sidx] = time; m_line_fill_time = time; } @@ -340,6 +404,22 @@ struct sector_cache_block : public cache_block_t { m_status[sidx] = status; } + virtual void set_byte_mask(mem_fetch *mf) { + m_dirty_byte_mask = m_dirty_byte_mask | mf->get_access_byte_mask(); + } + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { + m_dirty_byte_mask = m_dirty_byte_mask | byte_mask; + } + virtual mem_access_byte_mask_t get_dirty_byte_mask() { + return m_dirty_byte_mask; + } + virtual mem_access_sector_mask_t get_dirty_sector_mask() { + mem_access_sector_mask_t sector_mask; + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (m_status[i] == MODIFIED) sector_mask.set(i); + } + return sector_mask; + } virtual unsigned long long get_last_access_time() { return m_line_last_access_time; } @@ -365,7 +445,15 @@ struct sector_cache_block : public cache_block_t { unsigned sidx = get_sector_index(sector_mask); m_set_modified_on_fill[sidx] = m_modified; } + virtual void set_byte_mask_on_fill(bool m_modified) { + m_set_byte_mask_on_fill = m_modified; + } + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) { + unsigned sidx = get_sector_index(sector_mask); + m_set_readable_on_fill[sidx] = readable; + } virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) { unsigned sidx = get_sector_index(sector_mask); @@ -400,7 +488,10 @@ struct sector_cache_block : public cache_block_t { cache_block_state m_status[SECTOR_CHUNCK_SIZE]; bool m_ignore_on_fill_status[SECTOR_CHUNCK_SIZE]; bool m_set_modified_on_fill[SECTOR_CHUNCK_SIZE]; + bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; + bool m_set_byte_mask_on_fill; bool m_readable[SECTOR_CHUNCK_SIZE]; + mem_access_byte_mask_t m_dirty_byte_mask; unsigned get_sector_index(mem_access_sector_mask_t sector_mask) { assert(sector_mask.count() == 1); @@ -463,6 +554,7 @@ class cache_config { m_data_port_width = 0; m_set_index_function = LINEAR_SET_FUNCTION; m_is_streaming = false; + m_wr_percent = 0; } void init(char *config, FuncCache status) { cache_status = status; @@ -503,16 +595,6 @@ class cache_config { default: exit_parse_error(); } - switch (rp) { - case 'L': - m_replacement_policy = LRU; - break; - case 'F': - m_replacement_policy = FIFO; - break; - default: - exit_parse_error(); - } switch (wp) { case 'R': m_write_policy = READ_ONLY; @@ -546,22 +628,27 @@ class cache_config { exit_parse_error(); } if (m_alloc_policy == STREAMING) { - // For streaming cache, we set the alloc policy to be on-fill to remove - // all line_alloc_fail stalls we set the MSHRs to be equal to max - // allocated cache lines. This is possible by moving TAG to be shared - // between cache line and MSHR enrty (i.e. for each cache line, there is - // an MSHR rntey associated with it) This is the easiest think we can - // think about to model (mimic) L1 streaming cache in Pascal and Volta - // Based on our microbenchmakrs, MSHRs entries have been increasing - // substantially in Pascal and Volta For more information about streaming - // cache, see: - // http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf - // https://ieeexplore.ieee.org/document/8344474/ + /* + For streaming cache: + (1) we set the alloc policy to be on-fill to remove all line_alloc_fail + stalls. 
if the whole memory is allocated to the L1 cache, then make the + allocation to be on_MISS otherwise, make it ON_FILL to eliminate line + allocation fails. i.e. MSHR throughput is the same, independent on the L1 + cache size/associativity So, we set the allocation policy per kernel + basis, see shader.cc, max_cta() function + + (2) We also set the MSHRs to be equal to max + allocated cache lines. This is possible by moving TAG to be shared + between cache line and MSHR enrty (i.e. for each cache line, there is + an MSHR rntey associated with it). This is the easiest think we can + think of to model (mimic) L1 streaming cache in Pascal and Volta + + For more information about streaming cache, see: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + https://ieeexplore.ieee.org/document/8344474/ + */ m_is_streaming = true; m_alloc_policy = ON_FILL; - m_mshr_entries = m_nset * m_assoc * MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; - if (m_cache_type == SECTOR) m_mshr_entries *= SECTOR_CHUNCK_SIZE; - m_mshr_max_merge = MAX_WARP_PER_SM; } switch (mshr_type) { case 'F': @@ -610,7 +697,8 @@ class cache_config { } // detect invalid configuration - if (m_alloc_policy == ON_FILL and m_write_policy == WRITE_BACK) { + if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) and + m_write_policy == WRITE_BACK) { // A writeback cache with allocate-on-fill policy will inevitably lead to // deadlock: The deadlock happens when an incoming cache-fill evicts a // dirty line, generating a writeback request. If the memory subsystem is @@ -656,6 +744,9 @@ class cache_config { case 'L': m_set_index_function = LINEAR_SET_FUNCTION; break; + case 'X': + m_set_index_function = BITWISE_XORING_FUNCTION; + break; default: exit_parse_error(); } @@ -675,11 +766,11 @@ class cache_config { } unsigned get_max_num_lines() const { assert(m_valid); - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * m_nset * original_m_assoc; + return get_max_cache_multiplier() * m_nset * original_m_assoc; } unsigned get_max_assoc() const { assert(m_valid); - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * original_m_assoc; + return get_max_cache_multiplier() * original_m_assoc; } void print(FILE *fp) const { fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", @@ -688,6 +779,10 @@ class cache_config { virtual unsigned set_index(new_addr_type addr) const; + virtual unsigned get_max_cache_multiplier() const { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } + unsigned hash_function(new_addr_type addr, unsigned m_nset, unsigned m_line_sz_log2, unsigned m_nset_log2, unsigned m_index_function) const; @@ -722,10 +817,18 @@ class cache_config { } bool is_streaming() { return m_is_streaming; } FuncCache get_cache_status() { return cache_status; } + void set_allocation_policy(enum allocation_policy_t alloc) { + m_alloc_policy = alloc; + } char *m_config_string; char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; + unsigned m_wr_percent; + write_allocate_policy_t get_write_allocate_policy() { + return m_write_alloc_policy; + } + write_policy_t get_write_policy() { return m_write_policy; } protected: void exit_parse_error() { @@ -782,6 +885,7 @@ class cache_config { friend class l1_cache; friend class l2_cache; friend class memory_sub_partition; + friend class mee; }; class l1d_cache_config : public cache_config { @@ -789,16 +893,28 @@ class l1d_cache_config : public cache_config { l1d_cache_config() : cache_config() {} unsigned set_bank(new_addr_type addr) const; void init(char *config, 
FuncCache status) { - m_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); - m_l1_banks_log2 = LOGB2(l1_banks); + l1_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); + l1_banks_log2 = LOGB2(l1_banks); cache_config::init(config, status); } unsigned l1_latency; unsigned l1_banks; - unsigned m_l1_banks_log2; + unsigned l1_banks_log2; unsigned l1_banks_byte_interleaving; - unsigned m_banks_byte_interleaving_log2; + unsigned l1_banks_byte_interleaving_log2; unsigned l1_banks_hashing_function; + unsigned m_unified_cache_size; + virtual unsigned get_max_cache_multiplier() const { + // set * assoc * cacheline size. Then convert Byte to KB + // gpgpu_unified_cache_size is in KB while original_sz is in B + if (m_unified_cache_size > 0) { + unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; + assert(m_unified_cache_size % original_size == 0); + return m_unified_cache_size / original_size; + } else { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } + } }; class l2_cache_config : public cache_config { @@ -818,20 +934,22 @@ class tag_array { ~tag_array(); enum cache_request_status probe(new_addr_type addr, unsigned &idx, - mem_fetch *mf, bool probe_mode = false) const; + mem_fetch *mf, bool is_write, + bool probe_mode = false) const; enum cache_request_status probe(new_addr_type addr, unsigned &idx, - mem_access_sector_mask_t mask, + mem_access_sector_mask_t mask, bool is_write, bool probe_mode = false, mem_fetch *mf = NULL) const; enum cache_request_status access(new_addr_type addr, unsigned time, unsigned &idx, mem_fetch *mf); enum cache_request_status access(new_addr_type addr, unsigned time, unsigned &idx, bool &wb, - evicted_block_info &evicted, mem_fetch *mf); + evicted_block_info &evicted, mem_fetch *mf, bool mshr_hit_avail); - void fill(new_addr_type addr, unsigned time, mem_fetch *mf); + void fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write); void fill(unsigned idx, unsigned time, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask); + void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, + mem_access_byte_mask_t byte_mask, bool is_write); unsigned size() const { return m_config.get_num_lines(); } cache_block_t *get_block(unsigned idx) { return m_lines[idx]; } @@ -849,6 +967,7 @@ class tag_array { void update_cache_parameters(cache_config &config); void add_pending_line(mem_fetch *mf); void remove_pending_line(mem_fetch *mf); + void inc_dirty() { m_dirty++; } protected: // This constructor is intended for use only from derived classes that wish to @@ -869,6 +988,7 @@ class tag_array { // allocated but not filled unsigned m_res_fail; unsigned m_sector_miss; + unsigned m_dirty; // performance counters for calculating the amount of misses within a time // window @@ -883,6 +1003,10 @@ class tag_array { typedef tr1_hash_map line_table; line_table pending_lines; + + friend class baseline_cache; + friend class l2_cache; + friend class mee; }; class mshr_table { @@ -1110,6 +1234,8 @@ class cache_stats { unsigned long long m_cache_port_available_cycles; unsigned long long m_cache_data_port_busy_cycles; unsigned long long m_cache_fill_port_busy_cycles; + + friend class gpgpu_sim; }; class cache_t { @@ -1214,7 +1340,8 @@ class baseline_cache : public cache_t { // something is read or written without doing anything else. 
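The get_max_cache_multiplier() override added to l1d_cache_config above derives how many times larger the unified L1D/shared-memory array is than the baseline L1D configuration. The sketch below is a standalone restatement of that arithmetic, not the simulator code itself; the function and parameter names are illustrative, and it assumes, as the comment in the diff notes, that -gpgpu_unified_l1d_size is given in KB while the line size is in bytes.

```cpp
#include <cassert>

// Illustrative sketch: the baseline cache size (sets x assoc x line size)
// is converted from bytes to KB, and the unified size must be an integer
// multiple of it; a value of 0 falls back to the default multiplier.
unsigned unified_l1d_multiplier(unsigned nset, unsigned assoc,
                                unsigned line_sz_bytes,
                                unsigned unified_cache_size_kb,
                                unsigned default_multiplier) {
  if (unified_cache_size_kb == 0) return default_multiplier;
  unsigned original_size_kb = nset * assoc * line_sz_bytes / 1024;
  assert(unified_cache_size_kb % original_size_kb == 0);
  return unified_cache_size_kb / original_size_kb;
}
```

For example, a 32 KB baseline L1D (64 sets, 4 ways, 128 B lines) with -gpgpu_unified_l1d_size set to 128 yields a multiplier of 4.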
void force_tag_access(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask) { - m_tag_array->fill(addr, time, mask); + mem_access_byte_mask_t byte_mask; + m_tag_array->fill(addr, time, mask, byte_mask, true); } protected: @@ -1316,6 +1443,10 @@ class baseline_cache : public cache_t { }; bandwidth_management m_bandwidth_management; + + friend class l2_cache; + friend class data_cache; + friend class mee; }; /// Read only cache @@ -1443,6 +1574,7 @@ class data_cache : public baseline_cache { unsigned cache_index, mem_fetch *mf, unsigned time, std::list &events); + enum cache_request_status probe(new_addr_type addr, mem_fetch *mf) const; protected: mem_fetch_allocator *m_memfetch_creator; @@ -1451,7 +1583,7 @@ class data_cache : public baseline_cache { /// Sends write request to lower level memory (write or writeback) void send_write_request(mem_fetch *mf, cache_event request, unsigned time, std::list &events); - + void update_m_readable(mem_fetch *mf, unsigned cache_index); // Member Function pointers - Set by configuration options // to the functions below each grouping /******* Write-hit configs *******/ @@ -1572,6 +1704,23 @@ class l2_cache : public data_cache { virtual enum cache_request_status access(new_addr_type addr, mem_fetch *mf, unsigned time, std::list &events); + virtual enum cache_request_status probe(new_addr_type addr, mem_fetch *mf) const; +}; + +class meta_cache : public data_cache { + public: + meta_cache(const char *name, cache_config &config, int core_id, int type_id, + mem_fetch_interface *memport, mem_fetch_allocator *mfcreator, + enum mem_fetch_status status, class gpgpu_sim *gpu) + : data_cache(name, config, core_id, type_id, memport, mfcreator, status, + META_WR_ALLOC_R, META_WRBK_ACC, gpu) {} + + virtual ~meta_cache() {} + + virtual enum cache_request_status access(new_addr_type addr, mem_fetch *mf, + unsigned time, + std::list &events); + virtual enum cache_request_status probe(new_addr_type addr, mem_fetch *mf) const; }; /*****************************************************************************/ diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 165068879..4321a55c7 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, -// Ali Bakhoda, Andrew Turner, Ivan Sham -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, +// Ali Bakhoda, Andrew Turner, Ivan Sham, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. 
Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -95,10 +96,11 @@ tr1_hash_map address_random_interleaving; #include "mem_latency_stat.h" + void power_config::reg_options(class OptionParser *opp) { - option_parser_register(opp, "-gpuwattch_xml_file", OPT_CSTR, - &g_power_config_name, "GPUWattch XML file", - "gpuwattch.xml"); + option_parser_register(opp, "-accelwattch_xml_file", OPT_CSTR, + &g_power_config_name, "AccelWattch XML file", + "accelwattch_sass_sim.xml"); option_parser_register(opp, "-power_simulation_enabled", OPT_BOOL, &g_power_simulation_enabled, @@ -108,6 +110,92 @@ void power_config::reg_options(class OptionParser *opp) { &g_power_per_cycle_dump, "Dump detailed power output each cycle", "0"); + + + + option_parser_register(opp, "-hw_perf_file_name", OPT_CSTR, + &g_hw_perf_file_name, "Hardware Performance Statistics file", + "hw_perf.csv"); + + option_parser_register(opp, "-hw_perf_bench_name", OPT_CSTR, + &g_hw_perf_bench_name, "Kernel Name in Hardware Performance Statistics file", + ""); + + option_parser_register(opp, "-power_simulation_mode", OPT_INT32, + &g_power_simulation_mode, + "Switch performance counter input for power simulation (0=Sim, 1=HW, 2=HW-Sim Hybrid)", "0"); + + option_parser_register(opp, "-dvfs_enabled", OPT_BOOL, + &g_dvfs_enabled, + "Turn on DVFS for power model", "0"); + option_parser_register(opp, "-aggregate_power_stats", OPT_BOOL, + &g_aggregate_power_stats, + "Accumulate power across all kernels", "0"); + + //Accelwattch Hyrbid Configuration + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_RH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_RH], + "Get L1 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_RM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_RM], + "Get L1 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_WH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_WH], + "Get L1 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_WM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_WM], + "Get L1 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_RH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_RH], + "Get L2 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_RM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_RM], + "Get L2 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_WH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_WH], + "Get L2 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_WM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_WM], + 
"Get L2 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_CC_ACC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_CC_ACC], + "Get Constant Cache Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_SHARED_ACC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_SHRD_ACC], + "Get Shared Memory Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_RD", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_DRAM_RD], + "Get DRAM Reads for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_WR", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_DRAM_WR], + "Get DRAM Writes for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_NOC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_NOC], + "Get Interconnect Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_PIPE_DUTY", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_PIPE_DUTY], + "Get Pipeline Duty Cycle Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_NUM_SM_IDLE", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_NUM_SM_IDLE], + "Get Number of Idle SMs for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_CYCLES", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_CYCLES], + "Get Executed Cycles for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_VOLTAGE", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_VOLTAGE], + "Get Chip Voltage for Accelwattch-Hybrid from Accel-Sim", "0"); + + // Output Data Formats option_parser_register( opp, "-power_trace_enabled", OPT_BOOL, &g_power_trace_enabled, @@ -129,6 +217,9 @@ void power_config::reg_options(class OptionParser *opp) { } void memory_config::reg_options(class OptionParser *opp) { + option_parser_register(opp, "-gpgpu_crypto_latency", OPT_INT32, + &m_crypto_latency, "gpgpu secmem crypto latency", + "40"); option_parser_register(opp, "-gpgpu_perf_sim_memcpy", OPT_BOOL, &m_perf_sim_memcpy, "Fill the L2 cache on memcpy", "1"); @@ -149,6 +240,12 @@ void memory_config::reg_options(class OptionParser *opp) { " {::,:::,::,}", "64:128:8,L:B:m:N,A:16:4,4"); + option_parser_register(opp, "-gpgpu_cache:dmeta", OPT_CSTR, + &m_META_config.m_config_string, + "unified banked META data cache config " + " {::,:::,::,}", + "64:128:8,L:B:m:N,A:16:4,4"); option_parser_register(opp, "-gpgpu_cache:dl2_texture_only", OPT_BOOL, &m_L2_texure_only, "L2 cache used for texture only", "1"); @@ -249,6 +346,8 @@ void shader_core_config::reg_options(class OptionParser *opp) { " {::,:::,::, | none}", "none"); + option_parser_register(opp, "-gpgpu_l1_cache_write_ratio", OPT_UINT32, + &m_L1D_config.m_wr_percent, "L1D write ratio", "0"); option_parser_register(opp, "-gpgpu_l1_banks", OPT_UINT32, &m_L1D_config.l1_banks, "The number of L1 cache banks", "1"); @@ -326,7 +425,14 @@ void shader_core_config::reg_options(class OptionParser *opp) { option_parser_register( opp, "-gpgpu_shmem_size", OPT_UINT32, &gpgpu_shmem_size, "Size of shared memory per shader core (default 16kB)", "16384"); - option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_UINT32, + option_parser_register(opp, "-gpgpu_shmem_option", 
OPT_CSTR, + &gpgpu_shmem_option, + "Option list of shared memory sizes", "0"); + option_parser_register( + opp, "-gpgpu_unified_l1d_size", OPT_UINT32, + &m_L1D_config.m_unified_cache_size, + "Size of unified data cache(L1D + shared memory) in KB", "0"); + option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); option_parser_register( opp, "-gpgpu_shmem_sizeDefault", OPT_UINT32, &gpgpu_shmem_sizeDefault, @@ -826,7 +932,7 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) #ifdef GPGPUSIM_POWER_MODEL m_gpgpusim_wrapper = new gpgpu_sim_wrapper(config.g_power_simulation_enabled, - config.g_power_config_name); + config.g_power_config_name, config.g_power_simulation_mode, config.g_dvfs_enabled); #endif m_shader_stats = new shader_core_stats(m_shader_config); @@ -1001,6 +1107,14 @@ void gpgpu_sim::init() { partiton_reqs_in_parallel_util = 0; gpu_sim_cycle_parition_util = 0; +// McPAT initialization function. Called on first launch of GPU +#ifdef GPGPUSIM_POWER_MODEL + if (m_config.g_power_simulation_enabled) { + init_mcpat(m_config, m_gpgpusim_wrapper, m_config.gpu_stat_sample_freq, + gpu_tot_sim_insn, gpu_sim_insn); + } +#endif + reinit_clock_domains(); gpgpu_ctx->func_sim->set_param_gpgpu_num_shaders(m_config.num_shader()); for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) @@ -1026,14 +1140,6 @@ void gpgpu_sim::init() { } if (g_network_mode) icnt_init(); - - // McPAT initialization function. Called on first launch of GPU -#ifdef GPGPUSIM_POWER_MODEL - if (m_config.g_power_simulation_enabled) { - init_mcpat(m_config, m_gpgpusim_wrapper, m_config.gpu_stat_sample_freq, - gpu_tot_sim_insn, gpu_sim_insn); - } -#endif } void gpgpu_sim::update_stats() { @@ -1058,6 +1164,11 @@ void gpgpu_sim::update_stats() { gpu_occupancy = occupancy_stats(); } +PowerscalingCoefficients *gpgpu_sim::get_scaling_coeffs() +{ + return m_gpgpusim_wrapper->get_scaling_coeffs(); +} + void gpgpu_sim::print_stats() { gpgpu_ctx->stats->ptx_file_line_stats_write_file(); gpu_print_stat(); @@ -1137,6 +1248,18 @@ std::string gpgpu_sim::executed_kernel_info_string() { return statout.str(); } + +std::string gpgpu_sim::executed_kernel_name() { + std::stringstream statout; + if( m_executed_kernel_names.size() == 1) + statout << m_executed_kernel_names[0]; + else{ + for (unsigned int k = 0; k < m_executed_kernel_names.size(); k++) { + statout << m_executed_kernel_names[k] << " "; + } + } + return statout.str(); +} void gpgpu_sim::set_cache_config(std::string kernel_name, FuncCache cacheConfig) { m_special_cache_config[kernel_name] = cacheConfig; @@ -1231,6 +1354,116 @@ void gpgpu_sim::clear_executed_kernel_info() { m_executed_kernel_names.clear(); m_executed_kernel_uids.clear(); } + +void gpgpu_sim::gpu_print_METACache_stat(char META[]) { + if (!m_memory_config->m_META_config.disabled()) { + cache_stats l2_stats; + struct cache_sub_stats l2_css; + struct cache_sub_stats total_l2_css; + l2_stats.clear(); + l2_css.clear(); + total_l2_css.clear(); + + printf("\n========= %s cache stats =========\n", META); + + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_memory_partition_unit[i]->accumulate_METAcache_stats(l2_stats, META); + m_memory_partition_unit[i]->get_METAcache_sub_stats(l2_css, META); + + fprintf(stdout, + "%s_cache_bank[%d]: Access = %llu, Miss = %llu, Miss_rate = " + "%.3lf, Pending_hits = %llu, Reservation_fails = %llu\n", + META, i, l2_css.accesses, l2_css.misses, + (double)l2_css.misses / 
(double)l2_css.accesses, + l2_css.pending_hits, l2_css.res_fails); + + total_l2_css += l2_css; + } + + if (!m_memory_config->m_META_config.disabled() && + m_memory_config->m_META_config.get_num_lines()) { + // L2c_print_cache_stat(); + printf("%s_total_cache_accesses = %llu\n", META, total_l2_css.accesses); + printf("%s_total_cache_misses = %llu\n", META, total_l2_css.misses); + if (total_l2_css.accesses > 0) + printf("%s_total_cache_miss_rate = %.4lf\n", + META, (double)total_l2_css.misses / (double)total_l2_css.accesses); + //secondary MISS + printf("%s_total_cache_secondary_misses = %llu\n", META, l2_stats.m_stats[META_ACC][MSHR_HIT]); + //secondary MISS rate + if (total_l2_css.misses > 0) + printf("%s_total_cache_secondary_miss_rate = %.4lf\n", META, (double)l2_stats.m_stats[META_ACC][MSHR_HIT] / ((double)total_l2_css.misses + (double)l2_stats.m_stats[META_ACC][MSHR_HIT])); + printf("%s_total_cache_pending_hits = %llu\n", META, total_l2_css.pending_hits); + printf("%s_total_cache_reservation_fails = %llu\n", + META, total_l2_css.res_fails); + printf("%s_total_cache_breakdown:\n", META); + + char META_cache_stats_breakdown[128]; + strcpy(META_cache_stats_breakdown, META); + strcat(META_cache_stats_breakdown, "_cache_stats_breakdown"); + l2_stats.print_stats(stdout, META_cache_stats_breakdown); + + printf("%s_total_cache_reservation_fail_breakdown:\n", META); + + char META_cache_stats_fail_breakdown[128]; + strcpy(META_cache_stats_fail_breakdown, META); + strcat(META_cache_stats_fail_breakdown, "_cache_stats_fail_breakdown"); + l2_stats.print_fail_stats(stdout, META_cache_stats_fail_breakdown); + + char META_cache[128]; + strcpy(META_cache, META); + strcat(META_cache, "_cache"); + total_l2_css.print_port_stats(stdout, META_cache); + } + } +} + +void gpgpu_sim::gpu_print_METACache_data_type_breakdown() { + + printf("\n========= meta cache data type breakdown =========\n"); + + unsigned long long m_cache_tot_NORM_acc = 0; + unsigned long long m_cache_tot_CTR_acc = 0; + unsigned long long m_cache_tot_MAC_acc = 0; + unsigned long long m_cache_tot_BMT_acc = 0; + unsigned long long m_cache_tot_meta_wb = 0; + + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_cache_tot_NORM_acc += m_memory_partition_unit[i]->m_cache_NORM_acc; + m_cache_tot_CTR_acc += m_memory_partition_unit[i]->m_cache_CTR_acc; + m_cache_tot_MAC_acc += m_memory_partition_unit[i]->m_cache_MAC_acc; + m_cache_tot_BMT_acc += m_memory_partition_unit[i]->m_cache_BMT_acc; + m_cache_tot_meta_wb += m_memory_partition_unit[i]->m_cache_meta_wb; + } + + printf("m_cache_tot_NORM_acc = %lld\n", m_cache_tot_NORM_acc); + printf("m_cache_tot_CTR_acc = %lld\n", m_cache_tot_CTR_acc); + printf("m_cache_tot_MAC_acc = %lld\n", m_cache_tot_MAC_acc); + printf("m_cache_tot_BMT_acc = %lld\n", m_cache_tot_BMT_acc); + printf("m_cache_tot_meta_wb = %lld\n", m_cache_tot_meta_wb); + +} +void gpgpu_sim::gpu_print_ctrModCount_breakdown() { + printf("\n========= ctr modification Count breakdown =========\n"); + + int ctrModificationCountBreakdown[20]; + memset(ctrModificationCountBreakdown, 0, sizeof(ctrModificationCountBreakdown)); + counterMap *m_count; + counterMap::iterator it; + + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_count = m_memory_partition_unit[i]->get_ctrModificationCount(); + + for (it = m_count->begin(); it != m_count->end(); it++) { + ctrModificationCountBreakdown[max(0, (int)floor(log2(it->second)))]++;// - 6 + } + } + + for (int i = 0; i < 10; i++) { + printf("ctrModificationCountBreakdown[%d] = %d\n", 1 << 
(i), ctrModificationCountBreakdown[i]); // + 7 + } +} + void gpgpu_sim::gpu_print_stat() { FILE *statfout = stdout; @@ -1317,10 +1550,20 @@ void gpgpu_sim::gpu_print_stat() { m_shader_stats->print(stdout); #ifdef GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { + if(m_config.g_power_simulation_mode > 0){ + //if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); + calculate_hw_mcpat(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, + gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, + gpu_sim_insn, m_config.g_power_simulation_mode, m_config.g_dvfs_enabled, + m_config.g_hw_perf_file_name, m_config.g_hw_perf_bench_name, executed_kernel_name(), m_config.accelwattch_hybrid_configuration, m_config.g_aggregate_power_stats); + } m_gpgpusim_wrapper->print_power_kernel_stats( gpu_sim_cycle, gpu_tot_sim_cycle, gpu_tot_sim_insn + gpu_sim_insn, kernel_info_str, true); - mcpat_reset_perf_count(m_gpgpusim_wrapper); + //if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); } #endif @@ -1371,6 +1614,16 @@ void gpgpu_sim::gpu_print_stat() { total_l2_css.print_port_stats(stdout, "L2_cache"); } } + // CTR cache stats + gpu_print_METACache_stat("CTR"); + // MAC cache stats + gpu_print_METACache_stat("MAC"); + // BMT cache stats + gpu_print_METACache_stat("BMT"); + + // mf data type breakdown + gpu_print_METACache_data_type_breakdown(); + gpu_print_ctrModCount_breakdown(); if (m_config.gpgpu_cflog_interval != 0) { spill_log_to_file(stdout, 1, gpu_sim_cycle); @@ -1787,6 +2040,7 @@ void gpgpu_sim::cycle() { m_power_stats->pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_req[CURRENT_STAT_IDX][i]); } } @@ -1807,6 +2061,8 @@ void gpgpu_sim::cycle() { m_memory_sub_partition[i]->push(mf, gpu_sim_cycle + gpu_tot_sim_cycle); if (mf) partiton_reqs_in_parallel_per_cycle++; } + if (i & 1) + m_memory_partition_unit[i >> 1]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle); m_memory_sub_partition[i]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle); m_memory_sub_partition[i]->accumulate_L2cache_stats( m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]); @@ -1830,7 +2086,7 @@ void gpgpu_sim::cycle() { m_cluster[i]->core_cycle(); *active_sms += m_cluster[i]->get_n_active_sms(); } - // Update core icnt/cache stats for GPUWattch + // Update core icnt/cache stats for AccelWattch m_cluster[i]->get_icnt_stats( m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); @@ -1860,10 +2116,12 @@ void gpgpu_sim::cycle() { // McPAT main cycle (interface with McPAT) #ifdef GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { + if(m_config.g_power_simulation_mode == 0){ mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, m_power_stats, m_config.gpu_stat_sample_freq, gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, - gpu_sim_insn); + gpu_sim_insn, m_config.g_dvfs_enabled); + } } #endif @@ -1897,7 +2155,7 @@ void gpgpu_sim::cycle() { if (m_memory_config->m_L2_config.get_num_lines()) { int dlc = 0; for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { - dlc = m_memory_sub_partition[i]->flushL2(); + dlc = m_memory_sub_partition[i]->flushL2();//TODO assert(dlc == 0); // TODO: need to model actual writes to DRAM here 
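For reference, the gpu_print_ctrModCount_breakdown() routine above buckets each block's counter-modification count into power-of-two bins via max(0, floor(log2(count))). The self-contained sketch below illustrates only that bucketing on hypothetical data; the bin labels are one possible reading of the histogram, not simulator output.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <map>

// A value v lands in bin floor(log2(v)), clamped at 0, so bin i covers
// roughly [2^i, 2^(i+1)). The map stands in for the per-partition
// ctrModificationCount table (block address -> modification count).
int main() {
  std::map<unsigned long long, unsigned> ctr_mod_count = {
      {0x100, 1}, {0x140, 3}, {0x180, 70}, {0x1c0, 1000}};
  int breakdown[20] = {0};
  for (const auto &kv : ctr_mod_count)
    breakdown[std::max(0, (int)std::floor(std::log2((double)kv.second)))]++;
  for (int i = 0; i < 10; i++)
    printf("counters modified ~[%d, %d) times: %d\n", 1 << i, 1 << (i + 1),
           breakdown[i]);
  return 0;
}
```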
printf("Dirty lines flushed from L2 %d is %d\n", i, dlc); } diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 2e6820d82..d2784b98f 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
+ #ifndef GPU_SIM_H #define GPU_SIM_H @@ -68,6 +70,29 @@ extern tr1_hash_map address_random_interleaving; enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; +enum hw_perf_t { + HW_BENCH_NAME=0, + HW_KERNEL_NAME, + HW_L1_RH, + HW_L1_RM, + HW_L1_WH, + HW_L1_WM, + HW_CC_ACC, + HW_SHRD_ACC, + HW_DRAM_RD, + HW_DRAM_WR, + HW_L2_RH, + HW_L2_RM, + HW_L2_WH, + HW_L2_WM, + HW_NOC, + HW_PIPE_DUTY, + HW_NUM_SM_IDLE, + HW_CYCLES, + HW_VOLTAGE, + HW_TOTAL_STATS +}; + struct power_config { power_config() { m_valid = true; } void init() { @@ -82,7 +107,8 @@ struct power_config { s++; } char buf1[1024]; - snprintf(buf1, 1024, "gpgpusim_power_report__%s.log", date); + //snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); + snprintf(buf1, 1024, "accelwattch_power_report.log"); g_power_filename = strdup(buf1); char buf2[1024]; snprintf(buf2, 1024, "gpgpusim_power_trace_report__%s.log.gz", date); @@ -94,6 +120,9 @@ struct power_config { snprintf(buf4, 1024, "gpgpusim_steady_state_tracking_report__%s.log.gz", date); g_steady_state_tracking_filename = strdup(buf4); + // for(int i =0; i< hw_perf_t::HW_TOTAL_STATS; i++){ + // accelwattch_hybrid_configuration[i] = 0; + // } if (g_steady_power_levels_enabled) { sscanf(gpu_steady_state_definition, "%lf:%lf", @@ -125,6 +154,14 @@ struct power_config { double gpu_steady_power_deviation; double gpu_steady_min_period; + + char *g_hw_perf_file_name; + char *g_hw_perf_bench_name; + int g_power_simulation_mode; + bool g_dvfs_enabled; + bool g_aggregate_power_stats; + bool accelwattch_hybrid_configuration[hw_perf_t::HW_TOTAL_STATS]; + // Nonlinear power model bool g_use_nonlinear_model; char *gpu_nonlinear_model_config; @@ -228,6 +265,7 @@ class memory_config { m_address_mapping.init(m_n_mem, m_n_sub_partition_per_memory_channel); m_L2_config.init(&m_address_mapping); + m_META_config.init(&m_address_mapping); m_valid = true; @@ -239,6 +277,7 @@ class memory_config { bool m_valid; mutable l2_cache_config m_L2_config; + mutable l2_cache_config m_META_config; bool m_L2_texure_only; char *gpgpu_dram_timing_opt; @@ -312,6 +351,8 @@ class memory_config { unsigned gpgpu_frfcfs_dram_write_queue_size; unsigned write_high_watermark; unsigned write_low_watermark; + unsigned m_AES_Engines; + unsigned m_crypto_latency; bool m_perf_sim_memcpy; bool simple_dram_model; @@ -357,7 +398,7 @@ class gpgpu_sim_config : public power_config, m_valid = true; } - + unsigned get_core_freq() const { return core_freq; } unsigned num_shader() const { return m_shader_config.num_shader(); } unsigned num_cluster() const { return m_shader_config.n_simt_clusters; } unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; } @@ -527,9 +568,13 @@ class gpgpu_sim : public gpgpu_t { bool kernel_more_cta_left(kernel_info_t *kernel) const; bool hit_max_cta_count() const; kernel_info_t *select_kernel(); + PowerscalingCoefficients *get_scaling_coeffs(); void decrement_kernel_latency(); const gpgpu_sim_config &get_config() const { return m_config; } + void gpu_print_METACache_stat(char META[]); + void gpu_print_METACache_data_type_breakdown(); + void gpu_print_ctrModCount_breakdown(); void gpu_print_stat(); void dump_pipeline(int mask, int s, int m) const; @@ -634,6 +679,7 @@ class gpgpu_sim : public gpgpu_t { std::string executed_kernel_info_string(); //< format the kernel information // into a string for stat printout + std::string executed_kernel_name(); void clear_executed_kernel_info(); //< clear the kernel information after // stat printout virtual void createSIMTCluster() = 0; @@ 
-683,6 +729,9 @@ class gpgpu_sim : public gpgpu_t { m_functional_sim = false; m_functional_sim_kernel = NULL; } + + typedef std::map Count; + }; class exec_gpgpu_sim : public gpgpu_sim { diff --git a/src/gpgpu-sim/hashing.cc b/src/gpgpu-sim/hashing.cc index f566aa471..514b46a6e 100644 --- a/src/gpgpu-sim/hashing.cc +++ b/src/gpgpu-sim/hashing.cc @@ -35,7 +35,41 @@ unsigned ipoly_hash_function(new_addr_type higher_bits, unsigned index, * exit in GPGPU applications and also show good performance for other * strides. */ - if (bank_set_num == 16) { + if (bank_set_num == 2) { + std::bitset<64> a(higher_bits); + std::bitset<1> b(index); + std::bitset<1> new_index(index); + + new_index[0] = a[11] ^ a[9] ^ a[5] ^ a[4] ^ a[3] ^ a[2] ^ a[0] ^ b[0]; + + return new_index.to_ulong(); + + } if (bank_set_num == 4) { + std::bitset<64> a(higher_bits); + std::bitset<2> b(index); + std::bitset<2> new_index(index); + + new_index[0] = + a[10] ^ a[9] ^ a[7] ^ a[6] ^ a[4] ^ a[3] ^ a[1] ^ a[0] ^ b[0]; + new_index[1] = + a[9] ^ a[8] ^ a[6] ^ a[5] ^ a[3] ^ a[2] ^ a[1] ^ a[0] ^ b[1]; + + return new_index.to_ulong(); + + } if (bank_set_num == 8) { + std::bitset<64> a(higher_bits); + std::bitset<3> b(index); + std::bitset<3> new_index(index); + + new_index[0] = + a[11] ^ a[10] ^ a[9] ^ a[7] ^ a[4] ^ a[3] ^ a[2] ^ a[0] ^ b[0]; + new_index[1] = + a[12] ^ a[9] ^ a[8] ^ a[7] ^ a[5] ^ a[2] ^ a[1] ^ a[0] ^ b[1]; + new_index[2] = a[10] ^ a[9] ^ a[8] ^ a[6] ^ a[3] ^ a[2] ^ a[1] ^ b[2]; + + return new_index.to_ulong(); + + } if (bank_set_num == 16) { std::bitset<64> a(higher_bits); std::bitset<4> b(index); std::bitset<4> new_index(index); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index ab6e5c228..e3a41d5eb 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
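The hashing.cc additions above extend ipoly_hash_function() with dedicated 2-, 4-, and 8-way cases. Each one XOR-folds a fixed selection of the higher address bits into the index bits so that power-of-two strides spread across banks/sets. Below is a standalone copy of just the two-set case for illustration; the function name and the plain unsigned long long address type are stand-ins for the simulator's types.

```cpp
#include <bitset>
#include <cstdio>

// Two-set case of the IPOLY hash: the single index bit is XORed with a
// fixed subset of the higher address bits.
unsigned ipoly_hash_2sets(unsigned long long higher_bits, unsigned index) {
  std::bitset<64> a(higher_bits);
  std::bitset<1> b(index);
  std::bitset<1> new_index;
  new_index[0] = a[11] ^ a[9] ^ a[5] ^ a[4] ^ a[3] ^ a[2] ^ a[0] ^ b[0];
  return (unsigned)new_index.to_ulong();
}

int main() {
  // Two addresses that differ only in higher bit 11 map to different sets.
  printf("%u %u\n", ipoly_hash_2sets(0x800, 0), ipoly_hash_2sets(0x000, 0));
  return 0;
}
```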
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -37,10 +38,10 @@ #include "../option_parser.h" #include "../statwrapper.h" #include "dram.h" -#include "gpu-cache.h" #include "gpu-sim.h" #include "histogram.h" #include "l2cache.h" +#include "mee.h" #include "l2cache_trace.h" #include "mem_fetch.h" #include "mem_latency_stat.h" @@ -57,6 +58,19 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, return mf; } +mem_fetch *partition_mf_allocator::alloc( + new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, + m_memory_config->gpgpu_ctx); + mem_fetch *mf = + new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, + wid, sid, tpc, m_memory_config, cycle, original_mf); + return mf; +} memory_partition_unit::memory_partition_unit(unsigned partition_id, const memory_config *config, class memory_stats_t *stats, @@ -68,6 +82,46 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_gpu(gpu) { m_dram = new dram_t(m_id, m_config, m_stats, this, gpu); + unsigned int icnt_L2; + unsigned int L2_dram; + unsigned int dram_L2; + unsigned int L2_icnt; + sscanf(m_config->gpgpu_L2_queue_config, "%u:%u:%u:%u", &icnt_L2, &L2_dram, + &dram_L2, &L2_icnt); + + m_mee_dram_queue[TOT] = new fifo_pipeline("mee-to-dram", 0, 1); + m_dram_mee_queue[TOT] = new fifo_pipeline("dram-to-mee", 0, 1); + for (unsigned i = 1; i < NUM_DATA_TYPE; i++) { + m_mee_dram_queue[i] = new fifo_pipeline("mee-to-dram", 0, L2_dram); + m_dram_mee_queue[i] = new fifo_pipeline("dram-to-mee", 0, 128); + } + + char CTRc_name[32]; + char MACc_name[32]; + char BMTc_name[32]; + snprintf(CTRc_name, 32, "CTR_bank_%03d\0", m_id); + snprintf(MACc_name, 32, "MAC_bank_%03d\0", m_id); + snprintf(BMTc_name, 32, "BMT_bank_%03d\0", m_id); + // m_metainterface = new metainterface(this); + m_BMTinterface = new metainterface(m_mee_dram_queue[BMT]); + m_CTRinterface = new metainterface(m_mee_dram_queue[CTR]); + m_MACinterface = new metainterface(m_mee_dram_queue[MAC]); + m_mf_allocator = new partition_mf_allocator(config); + + if (!m_config->m_META_config.disabled()) { + m_CTRcache = + new meta_cache(CTRc_name, m_config->m_META_config, -1, -1, m_CTRinterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_MACcache = + new meta_cache(MACc_name, m_config->m_META_config, -1, -1, m_MACinterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_BMTcache = + new meta_cache(BMTc_name, m_config->m_META_config, -1, -1, m_BMTinterface, + m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + } + + m_mee = new mee(this, m_CTRcache, m_MACcache, m_BMTcache, m_config, m_gpu); + m_sub_partition = new memory_sub_partition *[m_config->m_n_sub_partition_per_memory_channel]; for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; @@ -77,6 +131,11 @@ memory_partition_unit::memory_partition_unit(unsigned partition_id, m_sub_partition[p] = new memory_sub_partition(sub_partition_id, m_config, stats, gpu); } + m_cache_NORM_acc = 0; + m_cache_CTR_acc = 0; + m_cache_MAC_acc = 0; + m_cache_BMT_acc = 0; + m_cache_meta_wb = 0; } void memory_partition_unit::handle_memcpy_to_gpu( @@ -94,6 +153,10 @@ 
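The memory_partition_unit constructor above builds one single-entry staging queue (index TOT) plus a bounded request and return FIFO per data type between the MEE and DRAM. The sketch below lays out that queue topology in isolation; the ordering of the non-TOT enumerators and the request-queue depth are assumptions for illustration (the real depth comes from -gpgpu_L2_queue_config, and the return queues are hard-coded to 128 entries).

```cpp
#include <cstdio>
#include <string>
#include <vector>

// TOT is the staging queue in front of DRAM; NORM/CTR/MAC/BMT appear in the
// diff but their numeric order after TOT is assumed here.
enum data_type_sketch { TOT = 0, NORM, CTR, MAC, BMT, NUM_DATA_TYPE };

struct queue_spec {
  std::string name;
  unsigned max_len;
};

int main() {
  unsigned L2_dram = 8;  // stand-in for the value parsed from -gpgpu_L2_queue_config
  std::vector<queue_spec> mee_dram(NUM_DATA_TYPE), dram_mee(NUM_DATA_TYPE);
  mee_dram[TOT] = {"mee-to-dram", 1};
  dram_mee[TOT] = {"dram-to-mee", 1};
  for (unsigned t = 1; t < NUM_DATA_TYPE; t++) {
    mee_dram[t] = {"mee-to-dram", L2_dram};  // per-type request queue
    dram_mee[t] = {"dram-to-mee", 128};      // per-type return queue
  }
  for (unsigned t = 0; t < NUM_DATA_TYPE; t++)
    printf("type %u: %s depth %u / %s depth %u\n", t, mee_dram[t].name.c_str(),
           mee_dram[t].max_len, dram_mee[t].name.c_str(), dram_mee[t].max_len);
  return 0;
}
```

The per-type queues let mee_to_dram_cycle() (below) arbitrate among metadata and normal traffic with trigger and stop thresholds before anything enters the single DRAM-facing queue.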
void memory_partition_unit::handle_memcpy_to_gpu( memory_partition_unit::~memory_partition_unit() { delete m_dram; + delete m_CTRcache; + // delete m_metainterface; + delete m_BMTinterface; + delete m_mee; for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; p++) { delete m_sub_partition[p]; @@ -183,10 +246,12 @@ bool memory_partition_unit::busy() const { } void memory_partition_unit::cache_cycle(unsigned cycle) { - for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; - p++) { - m_sub_partition[p]->cache_cycle(cycle); - } + // for (unsigned p = 0; p < m_config->m_n_sub_partition_per_memory_channel; + // p++) { + // m_sub_partition[p]->cache_cycle(cycle); + // } + // printf("memory_partition_unit cycle: %d\n", cycle); + m_mee->simple_cycle(cycle); } void memory_partition_unit::visualizer_print(gzFile visualizer_file) const { @@ -200,7 +265,7 @@ void memory_partition_unit::visualizer_print(gzFile visualizer_file) const { // determine whether a given subpartition can issue to DRAM bool memory_partition_unit::can_issue_to_dram(int inner_sub_partition_id) { int spid = inner_sub_partition_id; - bool sub_partition_contention = m_sub_partition[spid]->dram_L2_queue_full(); + bool sub_partition_contention = dram_mee_queue_full(); bool has_dram_resource = m_arbitration_metadata.has_credits(spid); MEMPART_DPRINTF( @@ -217,6 +282,51 @@ int memory_partition_unit::global_sub_partition_id_to_local_id( m_id * m_config->m_n_sub_partition_per_memory_channel); } +void memory_partition_unit::mee_to_dram_cycle() { + // if the mee-to-dram staging queue is full, stop sending + if (m_mee_dram_queue[TOT]->full()) return; + // send queues above the trigger threshold are drained first + for (unsigned i = 1; i < NUM_DATA_TYPE; i++) { + unsigned dtype = i; + if (m_mee_dram_queue[dtype]->get_n_element() >= send_trigger_threshold) { + if (m_n_mf[dtype] + m_dram_mee_queue[dtype]->get_n_element() >= receive_stop_threshold) continue; + m_mee_dram_queue[TOT]->push(m_mee_dram_queue[dtype]->top()); + m_n_mf[dtype]++; + // if (get_mpid() == 14) + // printf("mpid: %d m_n_mf[%d]=%d append %x acc_type: %d\n", get_mpid(), dtype, m_n_mf[dtype], m_mee_dram_queue[dtype]->top()->get_addr(), m_mee_dram_queue[dtype]->top()->get_access_type()); + m_mee_dram_queue[dtype]->pop(); + return; + } + } + // stop sending a type whose return queue is above the threshold + for (unsigned i = 0; i < NUM_DATA_TYPE; i++) { + unsigned dtype = (i + last_send + 1) % NUM_DATA_TYPE; + if (dtype == 0) continue; + if (m_mee_dram_queue[dtype]->empty()) continue; + if (m_n_mf[dtype] + m_dram_mee_queue[dtype]->get_n_element() >= receive_stop_threshold) continue; + m_mee_dram_queue[TOT]->push(m_mee_dram_queue[dtype]->top()); + m_n_mf[dtype]++; + // if (get_mpid() == 14) + // printf("mpid: %d m_n_mf[%d]=%d append %x acc_type: %d\n", get_mpid(), dtype, m_n_mf[dtype], m_mee_dram_queue[dtype]->top()->get_addr(), m_mee_dram_queue[dtype]->top()->get_access_type()); + m_mee_dram_queue[dtype]->pop(); + last_send = dtype; + return; + } +} + +void memory_partition_unit::dram_to_mee_cycle() { + // L2_WRBK_ACC is deleted in DRAM and is never sent to the mee + if (m_dram_mee_queue[TOT]->empty()) return; + mem_fetch *mf_return = m_dram_mee_queue[TOT]->top(); + if (!m_dram_mee_queue[mf_return->get_data_type()]->full()) { + m_dram_mee_queue[mf_return->get_data_type()]->push(mf_return); + m_n_mf[mf_return->get_data_type()]--; + // if (get_mpid() == 14) + // printf("mpid: %d m_n_mf[%d]=%d pop %x acc_type: %d\n", get_mpid(), mf_return->get_data_type(), m_n_mf[mf_return->get_data_type()], mf_return->get_addr(), mf_return->get_access_type()); + m_dram_mee_queue[TOT]->pop(); + } +} + void
memory_partition_unit::simple_dram_model_cycle() { // pop completed memory request from dram and push it to dram-to-L2 queue // of the original sub partition @@ -231,12 +341,12 @@ void memory_partition_unit::simple_dram_model_cycle() { unsigned dest_global_spid = mf_return->get_sub_partition_id(); int dest_spid = global_sub_partition_id_to_local_id(dest_global_spid); assert(m_sub_partition[dest_spid]->get_id() == dest_global_spid); - if (!m_sub_partition[dest_spid]->dram_L2_queue_full()) { + if (!dram_mee_queue_full()) { if (mf_return->get_access_type() == L1_WRBK_ACC) { m_sub_partition[dest_spid]->set_done(mf_return); delete mf_return; } else { - m_sub_partition[dest_spid]->dram_L2_queue_push(mf_return); + dram_mee_queue_push(mf_return); mf_return->set_status( IN_PARTITION_DRAM_TO_L2_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); @@ -264,12 +374,12 @@ void memory_partition_unit::simple_dram_model_cycle() { p++) { int spid = (p + last_issued_partition + 1) % m_config->m_n_sub_partition_per_memory_channel; - if (!m_sub_partition[spid]->L2_dram_queue_empty() && + if (!m_sub_partition[spid]->mee_dram_queue_empty() && can_issue_to_dram(spid)) { - mem_fetch *mf = m_sub_partition[spid]->L2_dram_queue_top(); + mem_fetch *mf = m_sub_partition[spid]->mee_dram_queue_top(); if (m_dram->full(mf->is_write())) break; - m_sub_partition[spid]->L2_dram_queue_pop(); + m_sub_partition[spid]->mee_dram_queue_pop(); MEMPART_DPRINTF( "Issue mem_fetch request %p from sub partition %d to dram\n", mf, spid); @@ -295,12 +405,12 @@ void memory_partition_unit::dram_cycle() { unsigned dest_global_spid = mf_return->get_sub_partition_id(); int dest_spid = global_sub_partition_id_to_local_id(dest_global_spid); assert(m_sub_partition[dest_spid]->get_id() == dest_global_spid); - if (!m_sub_partition[dest_spid]->dram_L2_queue_full()) { + if (!dram_mee_queue_full()) { if (mf_return->get_access_type() == L1_WRBK_ACC) { m_sub_partition[dest_spid]->set_done(mf_return); delete mf_return; } else { - m_sub_partition[dest_spid]->dram_L2_queue_push(mf_return); + dram_mee_queue_push(mf_return); mf_return->set_status(IN_PARTITION_DRAM_TO_L2_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); m_arbitration_metadata.return_credit(dest_spid); @@ -316,6 +426,9 @@ void memory_partition_unit::dram_cycle() { m_dram->cycle(); m_dram->dram_log(SAMPLELOG); + + mee_to_dram_cycle(); + dram_to_mee_cycle(); // mem_fetch *mf = m_sub_partition[spid]->L2_dram_queue_top(); // if( !m_dram->full(mf->is_write()) ) { @@ -326,12 +439,15 @@ void memory_partition_unit::dram_cycle() { p++) { int spid = (p + last_issued_partition + 1) % m_config->m_n_sub_partition_per_memory_channel; - if (!m_sub_partition[spid]->L2_dram_queue_empty() && + if (!mee_dram_queue_empty() && can_issue_to_dram(spid)) { - mem_fetch *mf = m_sub_partition[spid]->L2_dram_queue_top(); + mem_fetch *mf = mee_dram_queue_top(); + + if (global_sub_partition_id_to_local_id(mf->get_sub_partition_id()) != spid) continue; + if (m_dram->full(mf->is_write())) break; - m_sub_partition[spid]->L2_dram_queue_pop(); + mee_dram_queue_pop(); MEMPART_DPRINTF( "Issue mem_fetch request %p from sub partition %d to dram\n", mf, spid); @@ -356,6 +472,22 @@ void memory_partition_unit::dram_cycle() { mem_fetch *mf = m_dram_latency_queue.front().req; m_dram_latency_queue.pop_front(); m_dram->push(mf); + + if (mf->get_access_type() == META_WRBK_ACC) + m_cache_meta_wb++; + else if (mf->get_data_type() == NORM) + m_cache_NORM_acc++; + else if (mf->get_data_type() == CTR) + m_cache_CTR_acc++; + else if 
(mf->get_data_type() == MAC) + m_cache_MAC_acc++; + else if (mf->get_data_type() == BMT) + m_cache_BMT_acc++; + // if (mf->get_sub_partition_id() == 0) + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "to dram", mf_return->get_addr(), mf_return->get_sub_partition_id(), mf_return->get_partition_addr(), mf_return->get_access_type()); + + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "to dram", mf->get_addr(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type()); + } } @@ -365,6 +497,7 @@ void memory_partition_unit::set_done(mem_fetch *mf) { assert(m_sub_partition[spid]->get_id() == global_spid); if (mf->get_access_type() == L1_WRBK_ACC || mf->get_access_type() == L2_WRBK_ACC) { + m_n_mf[mf->get_data_type()]--; m_arbitration_metadata.return_credit(spid); MEMPART_DPRINTF( "mem_fetch request %p return from dram to sub partition %d\n", mf, @@ -375,9 +508,9 @@ void memory_partition_unit::set_dram_power_stats( unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, - unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_req) const { + unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, unsigned &n_req) const { m_dram->set_dram_power_stats(n_cmd, n_activity, n_nop, n_act, n_pre, n_rd, - n_wr, n_req); + n_wr, n_wr_WB, n_req); } void memory_partition_unit::print(FILE *fp) const { @@ -401,6 +534,45 @@ void memory_partition_unit::print(FILE *fp) const { m_dram->print(fp); } +void memory_partition_unit::accumulate_METAcache_stats( + class cache_stats &l2_stats, char META[]) const { + class meta_cache *m_METAcache; + if (strcmp(META, "CTR") == 0) { + m_METAcache = m_CTRcache; + } else if (strcmp(META, "MAC") == 0) { + m_METAcache = m_MACcache; + } else if (strcmp(META, "BMT") == 0) { + m_METAcache = m_BMTcache; + } else { + // if META is not one of the expected values, error handling can be added here + assert(0); + } if (!m_config->m_META_config.disabled()) { + l2_stats += m_METAcache->get_stats(); + } +} + +void memory_partition_unit::get_METAcache_sub_stats( + struct cache_sub_stats &css, char META[]) const { + class meta_cache *m_METAcache; + if (strcmp(META, "CTR") == 0) { + m_METAcache = m_CTRcache; + } else if (strcmp(META, "MAC") == 0) { + m_METAcache = m_MACcache; + } else if (strcmp(META, "BMT") == 0) { + m_METAcache = m_BMTcache; + } else { + // if META is not one of the expected values, error handling can be added here + assert(0); + } + if (!m_config->m_META_config.disabled()) { + m_METAcache->get_sub_stats(css); + } +} + +counterMap *memory_partition_unit::get_ctrModificationCount() { return m_mee->get_ctrModCount(); } + + memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, const memory_config *config, class memory_stats_t *stats, @@ -416,12 +588,25 @@ memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, char L2c_name[32]; snprintf(L2c_name, 32, "L2_bank_%03d", m_id); m_L2interface = new L2interface(this); + // m_metainterface = new metainterface(this); m_mf_allocator = new partition_mf_allocator(config); - if (!m_config->m_L2_config.disabled()) + if (!m_config->m_L2_config.disabled()) { m_L2cache = new l2_cache(L2c_name, m_config->m_L2_config, -1, -1, m_L2interface, m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + // m_CTRcache = + // new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + // m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + // m_MACcache = + // new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + // m_mf_allocator,
IN_PARTITION_L2_MISS_QUEUE, gpu); + // m_BMTcache = + // new l2_cache(L2c_name, m_config->m_META_config, -1, -1, m_metainterface, + // m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + } + + // m_sub_mee = new sub_mee(this, m_CTRcache, m_MACcache, m_BMTcache, m_config, m_gpu); unsigned int icnt_L2; unsigned int L2_dram; @@ -430,26 +615,32 @@ memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, sscanf(m_config->gpgpu_L2_queue_config, "%u:%u:%u:%u", &icnt_L2, &L2_dram, &dram_L2, &L2_icnt); m_icnt_L2_queue = new fifo_pipeline("icnt-to-L2", 0, icnt_L2); - m_L2_dram_queue = new fifo_pipeline("L2-to-dram", 0, L2_dram); - m_dram_L2_queue = new fifo_pipeline("dram-to-L2", 0, dram_L2); + m_L2_mee_queue = new fifo_pipeline("L2-to-mee", 0, L2_dram); + // m_mee_dram_queue = new fifo_pipeline("mee-to-dram", 0, L2_dram); + // m_dram_mee_queue = new fifo_pipeline("dram-to-mee", 0, dram_L2); + m_mee_L2_queue = new fifo_pipeline("mee-to-L2", 0, dram_L2); m_L2_icnt_queue = new fifo_pipeline("L2-to-icnt", 0, L2_icnt); wb_addr = -1; } memory_sub_partition::~memory_sub_partition() { delete m_icnt_L2_queue; - delete m_L2_dram_queue; - delete m_dram_L2_queue; + delete m_L2_mee_queue; + delete m_mee_L2_queue; delete m_L2_icnt_queue; delete m_L2cache; delete m_L2interface; } void memory_sub_partition::cache_cycle(unsigned cycle) { + // printf("memory_partition_unit cycle: %d\n", cycle); // L2 fill responses if (!m_config->m_L2_config.disabled()) { if (m_L2cache->access_ready() && !m_L2_icnt_queue->full()) { mem_fetch *mf = m_L2cache->next_access(); + // if (mf->get_access_type() == 9) + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill responses:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + if (mf->get_access_type() != L2_WR_ALLOC_R) { // Don't pass write allocate read request back to // upper level cache @@ -474,31 +665,41 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { } // DRAM to L2 (texture) and icnt (not texture) - if (!m_dram_L2_queue->empty()) { - mem_fetch *mf = m_dram_L2_queue->top(); + if (!m_mee_L2_queue->empty()) { + mem_fetch *mf = m_mee_L2_queue->top(); + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 fill:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + + // assert(mf_return->get_access_type() != 4); if (!m_config->m_L2_config.disabled() && m_L2cache->waiting_for_fill(mf)) { + assert(mf->get_access_type() != 4); if (m_L2cache->fill_port_free()) { + assert(mf->get_access_type() != 4); + mf->set_status(IN_PARTITION_L2_FILL_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); m_L2cache->fill(mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset); - m_dram_L2_queue->pop(); + m_mee_L2_queue->pop(); } } else if (!m_L2_icnt_queue->full()) { if (mf->is_write() && mf->get_type() == WRITE_ACK) mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); m_L2_icnt_queue->push(mf); - m_dram_L2_queue->pop(); + m_mee_L2_queue->pop(); } } - // prior L2 misses inserted into m_L2_dram_queue here + // m_mee->simple_cycle(cycle); + + // prior L2 misses inserted into m_L2_mee_queue here if (!m_config->m_L2_config.disabled()) m_L2cache->cycle(); // new L2 texture accesses and/or non-texture accesses - if (!m_L2_dram_queue->full() && !m_icnt_L2_queue->empty()) { + if (!m_L2_mee_queue->full() && !m_icnt_L2_queue->empty()) { mem_fetch *mf = m_icnt_L2_queue->top(); + // printf("%saddr: %x\tsp_id: 
%d\tsp_addr: %x\taccess type:%d\n", "L2 access\t", mf->get_addr(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type()); + if (!m_config->m_L2_config.disabled() && ((m_config->m_L2_texure_only && mf->istexture()) || (!m_config->m_L2_texure_only))) { @@ -512,6 +713,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memcpy_cycle_offset, events); + bool write_sent = was_write_sent(events); bool read_sent = was_read_sent(events); MEM_SUBPART_DPRINTF("Probing L2 cache Address=%llx, status=%u\n", @@ -541,10 +743,15 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { m_config->m_L2_config.m_write_alloc_policy == LAZY_FETCH_ON_READ) && !was_writeallocate_sent(events)) { - mf->set_reply(); - mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, - m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - m_L2_icnt_queue->push(mf); + if (mf->get_access_type() == L1_WRBK_ACC) { + m_request_tracker.erase(mf); + delete mf; + } else { + mf->set_reply(); + mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + m_L2_icnt_queue->push(mf); + } } // L2 cache accepted request m_icnt_L2_queue->pop(); @@ -558,7 +765,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { // L2 is disabled or non-texture access to texture-only L2 mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - m_L2_dram_queue->push(mf); + m_L2_mee_queue->push(mf); m_icnt_L2_queue->pop(); } } @@ -580,22 +787,73 @@ bool memory_sub_partition::full(unsigned size) const { return m_icnt_L2_queue->is_avilable_size(size); } -bool memory_sub_partition::L2_dram_queue_empty() const { - return m_L2_dram_queue->empty(); +// interface to L2_mee_queue + +bool memory_partition_unit::L2_mee_queue_empty(unsigned spid) const { + return m_sub_partition[spid]->m_L2_mee_queue->empty(); // TODO +} + +class mem_fetch *memory_partition_unit::L2_mee_queue_top(unsigned spid) const { + return m_sub_partition[spid]->m_L2_mee_queue->top(); // TODO +} + +void memory_partition_unit::L2_mee_queue_pop(unsigned spid) { m_sub_partition[spid]->m_L2_mee_queue->pop(); } // TODO + +// interface to mee_dram_queue + +bool memory_partition_unit::mee_dram_queue_empty() const { + return m_mee_dram_queue[TOT]->empty(); // TODO } -class mem_fetch *memory_sub_partition::L2_dram_queue_top() const { - return m_L2_dram_queue->top(); +class mem_fetch *memory_partition_unit::mee_dram_queue_top() const { + return m_mee_dram_queue[TOT]->top(); // TODO } -void memory_sub_partition::L2_dram_queue_pop() { m_L2_dram_queue->pop(); } +void memory_partition_unit::mee_dram_queue_pop() { m_mee_dram_queue[TOT]->pop(); } // TODO -bool memory_sub_partition::dram_L2_queue_full() const { - return m_dram_L2_queue->full(); +bool memory_partition_unit::mee_dram_queue_full(enum data_type dtype) const { + return m_mee_dram_queue[dtype]->full(); //TODO } -void memory_sub_partition::dram_L2_queue_push(class mem_fetch *mf) { - m_dram_L2_queue->push(mf); +bool memory_partition_unit::mee_dram_queue_full(int size, enum data_type dtype) const { + return m_mee_dram_queue[dtype]->full(size); //TODO +} + +void memory_partition_unit::mee_dram_queue_push(class mem_fetch *mf, enum data_type dtype) { + if (get_mpid() == 0) { + // printf("%saddr: %x\twr: %d\tdata_type: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %lld\n", "mee to dram push:\t", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_sub_partition_id(), 
mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + } + m_mee_dram_queue[dtype]->push(mf); //TODO +} + +// interface to dram_mee_queue + +bool memory_partition_unit::dram_mee_queue_empty(enum data_type dtype) const { + return m_dram_mee_queue[dtype]->empty(); // TODO +} + +class mem_fetch *memory_partition_unit::dram_mee_queue_top(enum data_type dtype) const { + return m_dram_mee_queue[dtype]->top(); // TODO +} + +void memory_partition_unit::dram_mee_queue_pop(enum data_type dtype) { m_dram_mee_queue[dtype]->pop(); } // TODO + +bool memory_partition_unit::dram_mee_queue_full() const { + return m_dram_mee_queue[TOT]->full(); //TODO +} + +void memory_partition_unit::dram_mee_queue_push(class mem_fetch *mf) { + m_dram_mee_queue[TOT]->push(mf); //TODO +} + +// interface to mee_L2_queue + +bool memory_partition_unit::mee_L2_queue_full(unsigned spid) const { + return m_sub_partition[spid]->m_mee_L2_queue->full(); //TODO +} + +void memory_partition_unit::mee_L2_queue_push(unsigned spid, class mem_fetch *mf) { + m_sub_partition[spid]->m_mee_L2_queue->push(mf); //TODO } void memory_sub_partition::print_cache_stat(unsigned &accesses, @@ -646,6 +904,7 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { unsigned pre = 0; unsigned rd = 0; unsigned wr = 0; + unsigned wr_WB = 0; unsigned req = 0; unsigned tot_cmd = 0; unsigned tot_nop = 0; @@ -657,13 +916,13 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { m_memory_partition_unit[i]->set_dram_power_stats(cmd, activity, nop, act, - pre, rd, wr, req); + pre, rd, wr, wr_WB, req); tot_cmd += cmd; tot_nop += nop; tot_act += act; tot_pre += pre; tot_rd += rd; - tot_wr += wr; + tot_wr += wr + wr_WB; tot_req += req; } fprintf(fout, "gpgpu_n_dram_reads = %d\n", tot_rd); @@ -677,14 +936,16 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { unsigned memory_sub_partition::flushL2() { if (!m_config->m_L2_config.disabled()) { - m_L2cache->flush(); + m_L2cache->flush();//TODO + // m_CTRcache->flush(); } return 0; // TODO: write the flushed data to the main memory } unsigned memory_sub_partition::invalidateL2() { if (!m_config->m_L2_config.disabled()) { - m_L2cache->invalidate(); + m_L2cache->invalidate();//TODO + // m_CTRcache->invalidate(); } return 0; } @@ -694,71 +955,68 @@ bool memory_sub_partition::busy() const { return !m_request_tracker.empty(); } std::vector memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { std::vector result; - + mem_access_sector_mask_t sector_mask = mf->get_access_sector_mask(); if (mf->get_data_size() == SECTOR_SIZE && mf->get_access_sector_mask().count() == 1) { result.push_back(mf); - } else if (mf->get_data_size() == 128 || mf->get_data_size() == 64) { - // We only accept 32, 64 and 128 bytes reqs - unsigned start = 0, end = 0; - if (mf->get_data_size() == 128) { + } else if (mf->get_data_size() == MAX_MEMORY_ACCESS_SIZE) { + // break down every sector + mem_access_byte_mask_t mask; + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr() + SECTOR_SIZE * i, mf->get_access_type(), + mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, mf->is_write(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf); + + 
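// [Editor's note] The branch above handles a full 128-byte (MAX_MEMORY_ACCESS_SIZE)
// request by splitting it into SECTOR_CHUNCK_SIZE = 4 sector requests of
// SECTOR_SIZE = 32 bytes each. A minimal sketch of the per-sector mask arithmetic
// this relies on (illustrative only, shown with a fresh mask per sector; it is not
// part of the patch):
//
//   // sector i of a 128-byte line covers bytes [i*32, (i+1)*32)
//   mem_access_byte_mask_t sector_bytes;
//   for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; ++k)
//     sector_bytes.set(k);
//   // the sub-request is issued at mf->get_addr() + SECTOR_SIZE * i with
//   // data size SECTOR_SIZE and sector-mask bit i set, and it keeps a pointer
//   // to the original mem_fetch so the reply can be matched back to it.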
result.push_back(n_mf); + } + // This is for constant cache + } else if (mf->get_data_size() == 64 && + (mf->get_access_sector_mask().all() || + mf->get_access_sector_mask().none())) { + unsigned start; + if (mf->get_addr() % MAX_MEMORY_ACCESS_SIZE == 0) start = 0; - end = 3; - } else if (mf->get_data_size() == 64 && - mf->get_access_sector_mask().to_string() == "1100") { + else start = 2; - end = 3; - } else if (mf->get_data_size() == 64 && - mf->get_access_sector_mask().to_string() == "0011") { - start = 0; - end = 1; - } else if (mf->get_data_size() == 64 && - (mf->get_access_sector_mask().to_string() == "1111" || - mf->get_access_sector_mask().to_string() == "0000")) { - if (mf->get_addr() % 128 == 0) { - start = 0; - end = 1; - } else { - start = 2; - end = 3; + mem_access_byte_mask_t mask; + for (unsigned i = start; i < start + 2; i++) { + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); } - } else { - printf( - "Invalid sector received, address = 0x%06llx, sector mask = %s, data " - "size = %d", - mf->get_addr(), mf->get_access_sector_mask(), mf->get_data_size()); - assert(0 && "Undefined sector mask is received"); - } - - std::bitset byte_sector_mask; - byte_sector_mask.reset(); - for (unsigned k = start * SECTOR_SIZE; k < SECTOR_SIZE; ++k) - byte_sector_mask.set(k); - - for (unsigned j = start, i = 0; j <= end; ++j, ++i) { - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, SECTOR_SIZE, - mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & byte_sector_mask, - std::bitset().set(j), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr(), mf->get_access_type(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, mf->is_write(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf); result.push_back(n_mf); - byte_sector_mask <<= SECTOR_SIZE; } } else { - printf( - "Invalid sector received, address = 0x%06llx, sector mask = %d, byte " - "mask = , data size = %u", - mf->get_addr(), mf->get_access_sector_mask().count(), - mf->get_data_size()); - assert(0 && "Undefined data size is received"); + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (sector_mask.test(i)) { + mem_access_byte_mask_t mask; + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr() + SECTOR_SIZE * i, mf->get_access_type(), + mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, + mf->is_write(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); + + result.push_back(n_mf); + } + } } - + if (result.size() == 0) assert(0 && "no mf sent"); return result; } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 3152db337..74bd45186 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -31,6 +32,7 @@ #include "../abstract_hardware_model.h" #include "dram.h" +#include "gpu-cache.h" #include #include @@ -51,6 +53,13 @@ class partition_mf_allocator : public mem_fetch_allocator { virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle) const; + virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, unsigned long long cycle, + unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const; private: const memory_config *m_memory_config; @@ -68,6 +77,8 @@ class memory_partition_unit { bool busy() const; + void dram_to_mee_cycle(); + void mee_to_dram_cycle(); void cache_cycle(unsigned cycle); void dram_cycle(); void simple_dram_model_cycle(); @@ -78,6 +89,8 @@ class memory_partition_unit { void print_stat(FILE *fp) { m_dram->print_stat(fp); } void visualize() const { m_dram->visualize(); } void print(FILE *fp) const; + void accumulate_METAcache_stats(class cache_stats &l2_stats, char META[]) const; + void get_METAcache_sub_stats(struct cache_sub_stats &css, char META[]) const; void handle_memcpy_to_gpu(size_t dst_start_addr, unsigned subpart_id, mem_access_sector_mask_t mask); @@ -88,7 +101,7 @@ class memory_partition_unit { // Power model void set_dram_power_stats(unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, unsigned &n_pre, - unsigned &n_rd, unsigned &n_wr, + unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, unsigned &n_req) const; int global_sub_partition_id_to_local_id(int global_sub_partition_id) const; @@ -99,13 +112,70 @@ class memory_partition_unit { return m_gpu; } + bool L2_mee_queue_empty(unsigned spid) const; + class mem_fetch *L2_mee_queue_top(unsigned spid) const; + void L2_mee_queue_pop(unsigned spid); + + bool mee_dram_queue_empty() const; + class mem_fetch *mee_dram_queue_top() const; + void mee_dram_queue_pop(); + bool mee_dram_queue_full(enum data_type 
dtype) const; + bool mee_dram_queue_full(int size, enum data_type dtype) const; + void mee_dram_queue_push(class mem_fetch *mf, enum data_type dtype); + + bool dram_mee_queue_empty(enum data_type dtype) const; + class mem_fetch *dram_mee_queue_top(enum data_type dtype) const; + void dram_mee_queue_pop(enum data_type dtype); + bool dram_mee_queue_full() const; + void dram_mee_queue_push(class mem_fetch *mf); + + void mee_L2_queue_push(unsigned spid, class mem_fetch *mf); + bool mee_L2_queue_full(unsigned spid) const; + + class memory_sub_partition **m_sub_partition; + private: unsigned m_id; const memory_config *m_config; class memory_stats_t *m_stats; - class memory_sub_partition **m_sub_partition; + // class memory_sub_partition **m_sub_partition; class dram_t *m_dram; + class meta_cache *m_CTRcache; + class meta_cache *m_MACcache; + class meta_cache *m_BMTcache; + class mee *m_mee; + // class metainterface *m_metainterface; + class metainterface *m_BMTinterface; + class metainterface *m_CTRinterface; + class metainterface *m_MACinterface; + partition_mf_allocator *m_mf_allocator; + + public: + unsigned long long m_cache_NORM_acc; + unsigned long long m_cache_CTR_acc; + unsigned long long m_cache_MAC_acc; + unsigned long long m_cache_BMT_acc; + unsigned long long m_cache_meta_wb; + void pop_n_mf(enum data_type dtype) { m_n_mf[dtype]--; } + + private: + fifo_pipeline *m_mee_dram_queue[5]; + fifo_pipeline *m_dram_mee_queue[5]; + unsigned m_n_mf[5] = {0, 0, 0, 0, 0}; + const unsigned send_trigger_threshold = 64; + const unsigned receive_stop_threshold = 64; + unsigned last_send = 0; + // fifo_pipeline *m_NORM_dram_queue; + // fifo_pipeline *m_CTR_dram_queue; + // fifo_pipeline *m_MAC_dram_queue; + // fifo_pipeline *m_BMT_dram_queue; + + // fifo_pipeline *m_dram_NORM_queue; + // fifo_pipeline *m_dram_CTR_queue; + // fifo_pipeline *m_dram_MAC_queue; + // fifo_pipeline *m_dram_BMT_queue; + class arbitration_metadata { public: arbitration_metadata(const memory_config *config); @@ -146,6 +216,10 @@ class memory_partition_unit { std::list m_dram_latency_queue; class gpgpu_sim *m_gpu; + + public: + counterMap *get_ctrModificationCount(); + friend class mee; }; class memory_sub_partition { @@ -170,14 +244,30 @@ class memory_sub_partition { unsigned flushL2(); unsigned invalidateL2(); - // interface to L2_dram_queue - bool L2_dram_queue_empty() const; - class mem_fetch *L2_dram_queue_top() const; - void L2_dram_queue_pop(); + // interface to L2_mee_queue + bool L2_mee_queue_empty() const; + class mem_fetch *L2_mee_queue_top() const; + void L2_mee_queue_pop(); + + // interface to mee_dram_queue + bool mee_dram_queue_full() const; + void mee_dram_queue_push(class mem_fetch *mf); - // interface to dram_L2_queue - bool dram_L2_queue_full() const; - void dram_L2_queue_push(class mem_fetch *mf); + bool mee_dram_queue_empty() const; + class mem_fetch *mee_dram_queue_top() const; + void mee_dram_queue_pop(); + + // interface to dram_mee_queue + bool dram_mee_queue_full() const; + void dram_mee_queue_push(class mem_fetch *mf); + + bool dram_mee_queue_empty() const; + class mem_fetch *dram_mee_queue_top() const; + void dram_mee_queue_pop(); + + // interface to mee_L2_queue + bool mee_L2_queue_full() const; + void mee_L2_queue_push(class mem_fetch *mf); void visualizer_print(gzFile visualizer_file); void print_cache_stat(unsigned &accesses, unsigned &misses) const; @@ -195,13 +285,28 @@ class memory_sub_partition { m_L2cache->force_tag_access(addr, m_memcpy_cycle_offset + time, mask); m_memcpy_cycle_offset += 
1; } + // class l2_cache *m_CTRcache; + std::vector breakdown_request_to_sector_requests(mem_fetch *mf); + // these are various FIFOs between units within a memory partition + fifo_pipeline *m_icnt_L2_queue; + fifo_pipeline *m_L2_mee_queue; + // fifo_pipeline *m_mee_dram_queue; + // fifo_pipeline *m_dram_mee_queue; + fifo_pipeline *m_mee_L2_queue; + fifo_pipeline *m_L2_icnt_queue; // L2 cache hit response queue + private: // data unsigned m_id; //< the global sub partition ID const memory_config *m_config; class l2_cache *m_L2cache; class L2interface *m_L2interface; + // class l2_cache *m_CTRcache; + // class l2_cache *m_MACcache; + // class l2_cache *m_BMTcache; + // class mee *m_mee; + // class metainterface *m_metainterface; class gpgpu_sim *m_gpu; partition_mf_allocator *m_mf_allocator; @@ -212,12 +317,6 @@ class memory_sub_partition { }; std::queue m_rop; - // these are various FIFOs between units within a memory partition - fifo_pipeline *m_icnt_L2_queue; - fifo_pipeline *m_L2_dram_queue; - fifo_pipeline *m_dram_L2_queue; - fifo_pipeline *m_L2_icnt_queue; // L2 cache hit response queue - class mem_fetch *L2dramout; unsigned long long int wb_addr; @@ -226,8 +325,9 @@ class memory_sub_partition { std::set m_request_tracker; friend class L2interface; + friend class metainterface; - std::vector breakdown_request_to_sector_requests(mem_fetch *mf); + // std::vector breakdown_request_to_sector_requests(mem_fetch *mf); // This is a cycle offset that has to be applied to the l2 accesses to account // for the cudamemcpy read/writes. We want GPGPU-Sim to only count cycles for @@ -244,15 +344,47 @@ class L2interface : public mem_fetch_interface { virtual ~L2interface() {} virtual bool full(unsigned size, bool write) const { // assume read and write packets all same size - return m_unit->m_L2_dram_queue->full(); + return m_unit->m_L2_mee_queue->full(); } virtual void push(mem_fetch *mf) { mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); - m_unit->m_L2_dram_queue->push(mf); + m_unit->m_L2_mee_queue->push(mf); + // if (mf->get_access_type() == 9) + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\twr: %d\taccess type:%d\n", "L2 to mee:", mf->get_addr(), mf->get_sid(), mf->get_is_write(), mf->get_partition_addr(), mf->get_access_type()); + + // printf("l2 to mee access type: %d\n",mf->get_access_type()); } private: memory_sub_partition *m_unit; }; +class metainterface : public mem_fetch_interface { + public: + // metainterface(memory_partition_unit *unit, enum cache_type dtype) { + metainterface(fifo_pipeline *pipeline) { + // m_unit = unit; + // m_dtype = dtype; + this->pipeline = pipeline; + } + virtual ~metainterface() {} + virtual bool full(unsigned size, bool write) const { + // assume read and write packets all same size + // return m_unit->mee_dram_queue_full(); + return pipeline->full(); + } + virtual void push(mem_fetch *mf) { + mf->set_status(IN_PARTITION_L2_TO_DRAM_QUEUE, 0 /*FIXME*/); + // printf("%saddr: %x\tmf_type: %d\tsp_addr: %x\taccess type:%d\n", "mee to dram:\t", mf->get_addr(), mf->get_data_type(), mf->get_partition_addr(), mf->get_access_type()); + + // m_unit->mee_dram_queue_push(mf); + pipeline->push(mf); + } + + private: + memory_partition_unit *m_unit; + enum cache_type m_dtype; + fifo_pipeline *pipeline; +}; + #endif diff --git a/src/gpgpu-sim/mee.cc b/src/gpgpu-sim/mee.cc new file mode 100644 index 000000000..f0fed8f8d --- /dev/null +++ b/src/gpgpu-sim/mee.cc @@ -0,0 +1,938 @@ +#include "mee.h" +#include +#define BMT_Enable +#define MAC_Enable + +mee::mee(class 
memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu) : + m_unit(unit), + m_CTRcache(CTRcache), + m_MACcache(MACcache), + m_BMTcache(BMTcache), + m_config(config), + m_gpu(gpu) { + unsigned len = 64; + m_CTR_queue = new fifo_pipeline("meta-queue", 0, len); + m_Ciphertext_queue = new fifo_pipeline("meta-queue", 0, len); + m_MAC_queue = new fifo_pipeline("meta-queue", 0, len); + m_BMT_queue = new fifo_pipeline("meta-queue", 0, len); + + m_CTR_RET_queue = new fifo_pipeline("meta-queue", 0, len); + m_MAC_RET_queue = new fifo_pipeline("meta-queue", 0, len); + m_BMT_RET_queue = new fifo_pipeline("meta-queue", 0, len); + m_Ciphertext_RET_queue = new fifo_pipeline("meta-queue", 0, len + 100); + + m_OTP_queue = new fifo_pipeline("meta-queue", m_config->m_crypto_latency, m_config->m_crypto_latency + len); + m_AES_queue = new fifo_pipeline("meta-queue", 0, 4); + + m_HASH_queue = new fifo_pipeline("meta-queue", m_config->m_crypto_latency, m_config->m_crypto_latency + len); + m_MAC_CHECK_queue = new fifo_pipeline("meta-queue", 0, 4); + + // m_HASH_queue = new fifo_pipeline("meta-queue", 40, 40 + len); + m_BMT_CHECK_queue = new fifo_pipeline("meta-queue", 0, len); + m_CTR_BMT_Buffer = new fifo_pipeline("meta-queue", 0, len); + + m_ctrModCount = new counterMap; + + BMT_busy = false; +} +int decode(int addr) { + return (addr & 16128) >> 8; +} +void mee::print_addr(char s[], mem_fetch *mf) { + // if (m_unit->get_mpid() == 14) { + // printf("%s\t", s); + // if (mf->get_original_mf()) + // printf("original_addr: %x\toriginal_sp_addr: %x\t", mf->get_original_mf()->get_addr(), mf->get_original_mf()->get_partition_addr()); + // printf("addr: %x\twr: %d\tdata_type: %d\tBMT_Layer: %d\tsp_id: %d\tsp_addr: %x\taccess type:%d\tmf_id: %d\tcycle: %d\n", mf->get_addr(),mf->is_write(), mf->get_data_type(), mf->get_BMT_Layer(), mf->get_sub_partition_id(), mf->get_partition_addr(), mf->get_access_type(), mf->get_id(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); // print_tag(); + // } +} + +void mee::print_status(class meta_cache *m_METAcache, mem_fetch *mf) { + // if (m_unit->get_mpid() == 14) { + // unsigned idx = m_METAcache->m_config.set_index(mf->get_addr()); + // enum cache_request_status status = m_METAcache->m_tag_array->probe(mf->get_addr(), idx, mf->get_access_sector_mask(), mf->is_write()); + // printf("idx is %u\t", idx); + // printf("sector mask is %u\n", mf->get_access_sector_mask().to_ulong()); + // m_METAcache->m_tag_array->m_lines[idx]->print_status(); + // } +} + +void mee::print_tag() { + // if (get_sub_partition_id(mf) == 0) { + // for (unsigned i = 0; i < m_config->m_META_config.get_num_lines(); i++) { + for (unsigned i = 188; i < 192; i++) { + // printf("line %d:\t", i); + // for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) + // // printf("%d\t", + // m_CTRcache->m_tag_array->m_lines[i]->print_status(); + // printf("\n"); + } + // } +} + +new_addr_type mee::get_partition_addr(mem_fetch *mf) { + new_addr_type partition_addr = mf->get_addr() >> (8 + 6) << 8; + partition_addr |= mf->get_addr() & ((1 << 8) - 1); + return mf->get_partition_addr(); +} + +new_addr_type mee::get_sub_partition_id(mem_fetch *mf) { + // return (mf->get_addr() >> 8) & ((1 << 6) - 1); + + return mf->get_sub_partition_id(); +} + +unsigned int mee::get_BMT_Layer(new_addr_type addr) { + for (int i = 0; i <= 4; i++) { + if ((addr & BMT_mask[i]) == BMT_base[i]) { + return i; + } + } + return 5; +} + +bool 
mee::META_queue_empty() { + return m_CTR_queue->empty() && m_Ciphertext_queue->empty() && m_MAC_queue->empty(); +} + +new_addr_type mee::get_addr(new_addr_type sub_partition_id, new_addr_type partition_addr) { + new_addr_type new_addr = partition_addr >> 8 << (8 + 6); + new_addr |= partition_addr & ((1 << 8) - 1); + new_addr |= sub_partition_id << 8; + return new_addr; +} + +void mee::gen_CTR_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id) { + new_addr_type partition_addr = get_partition_addr(mf); + new_addr_type sub_partition_id = get_sub_partition_id(mf); + // new_addr_type minor_addr = (partition_addr >> 7) & 127; + // minor_addr = 128 + minor_addr * 7; + // bool res = minor_addr & 7 > 1; + // minor_addr >>= 3; + partition_addr = (partition_addr >> 7); + + // if (meta_acc == META_ACC) + // partition_addr |= minor_addr; + + new_addr_type CTR_addr = get_addr(sub_partition_id, partition_addr); + CTR_addr |= CTR_base; + if (wr) + (*m_ctrModCount)[CTR_addr]++; + + // if (meta_acc == META_ACC && res) + // size <<= 1; + + meta_access(m_CTR_queue, CTR_addr, meta_acc, + size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, CTR, DEFAULT); +} + +void mee::gen_MAC_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id) { + new_addr_type partition_addr = get_partition_addr(mf); + new_addr_type sub_partition_id = get_sub_partition_id(mf); + if (m_config->m_META_config.m_cache_type == SECTOR) + partition_addr = partition_addr >> 6 << 2; + else + partition_addr = partition_addr >> 7 << 3; + new_addr_type MAC_addr = get_addr(sub_partition_id, partition_addr); + MAC_addr |= MAC_base; + + meta_access(m_MAC_queue, MAC_addr, meta_acc, + size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, MAC, DEFAULT); +} + +void mee::gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id) { + new_addr_type partition_addr = get_partition_addr(mf); + new_addr_type sub_partition_id = get_sub_partition_id(mf); + // unsigned int Layer = get_BMT_Layer(mf->get_addr()); + // if (Layer == 4) //由L4生成ROOT,由于ROOT是单独的寄存器,这里不生成访存请求 + // return; + partition_addr = partition_addr & 0x003fffff; + if (size == 128) + partition_addr = partition_addr >> 11 << 7; + else + partition_addr = partition_addr >> 9 << 5; + new_addr_type BMT_addr = get_addr(sub_partition_id, partition_addr); + BMT_addr |= 0xF2000000; + + enum BMT_Layer BMT_type = static_cast(mf->get_BMT_Layer() + 1); + + meta_access(m_BMT_queue, BMT_addr, meta_acc, + size, wr, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, mf_id, BMT, BMT_type); +} + +void mee::meta_access( + fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf, unsigned mf_id, enum data_type m_data_type, enum BMT_Layer m_Layer) const { + + mem_access_byte_mask_t byte_mask; + mem_access_sector_mask_t sector_mask; + unsigned data_size = 0; + if (size == 128) { + for (unsigned i = 0; i < size / 32; i++) + sector_mask.set(i); + addr = addr >> 7 << 7; + for (unsigned i = addr & 127; i < (addr & 127) + size; i++) byte_mask.set(i); + data_size = 128; + } + else { + for (unsigned i = (addr >> 5) & 3; i < ((addr >> 5) & 3) + ((size + 31) / 32); i++) + sector_mask.set(i); + addr = addr >> 5 << 5; + for (unsigned i = 
addr & 127; i < (addr & 127) + size; i++) byte_mask.set(i); + data_size = 32; + // sector_mask.set((addr >> 5) & 3); + } + mem_access_t acc(type, addr, data_size, wr, original_mf->get_access_warp_mask(), byte_mask, sector_mask, m_gpu->gpgpu_ctx); + mem_fetch *mf = new mem_fetch( + acc, NULL /*we don't have an instruction yet*/, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, + wid, sid, tpc, m_config, cycle, original_mf); + + std::vector<mem_fetch *> reqs; + if (m_config->m_META_config.m_cache_type == SECTOR) + reqs = m_unit->m_sub_partition[0]->breakdown_request_to_sector_requests(mf); + else + reqs.push_back(mf); + + assert(m_data_type != MAC || reqs.size() == 1); + + for (unsigned i = 0; i < reqs.size(); ++i) { + // assert(reqs.size() == 1); + mem_fetch *req = reqs[i]; + // req->set_id(mf_id); + req->set_data_type(m_data_type); + req->set_BMT_Layer(m_Layer); + if (i == reqs.size() - 1) + req->set_id(mf_id); + else + req->set_id(0); + assert(!m_META_queue->full()); + m_META_queue->push(req); + } +} + +void mee::CT_cycle() { + if (!m_Ciphertext_RET_queue->empty()) { + mem_fetch *mf_return = m_Ciphertext_RET_queue->top(); + int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + // if (mf_return->get_access_type() != L1_WR_ALLOC_R && mf_return->get_access_type() != L2_WR_ALLOC_R) { + if (mf_return->is_write()) { // write + // assert(!mf_return->is_write()); + // print_addr("mee to L2 W:\t", mf_return); + if (!m_unit->mee_L2_queue_full(spid)){ + // assert(!mf_return->is_write()); + // assert(mf_return->get_access_type() != 4); + m_unit->mee_L2_queue_push(spid, mf_return); // ciphertext write completed; return the ack to L2 + m_Ciphertext_RET_queue->pop(); + // } else { + // assert(mf_return->get_access_type() != 4); + } + } else if (!m_AES_queue->full() && !m_HASH_queue->full()) { // read + m_AES_queue->push(mf_return); // ciphertext returned from DRAM; send it to AES for decryption + // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; + // assert(m_MAC_table[(new_addr_type)mf_return]); + // if (m_unit->get_mpid() == 0) + // printf("HASH :%d\n", mf_return->get_id()); + m_HASH_queue->push(new hash(MAC, mf_return->get_id())); // ciphertext fetched from DRAM; hash it for the MAC check + m_Ciphertext_RET_queue->pop(); + } + } + + if (!m_Ciphertext_queue->empty()) { + mem_fetch *mf = m_Ciphertext_queue->top(); + // print_addr("L2 to mee:\t", mf); + if (mf->is_write()) { // write + // assert(!mf->is_write()); + if (mf->is_raw() && !m_AES_queue->full()) { + // assert(!mf->is_write()); + // printf("QQQQQQQQQQQQQQQQ\n"); + m_AES_queue->push(mf); // ciphertext write request; feed the plaintext into AES for encryption + mf->set_cooked_status(); + // m_MAC_table[(new_addr_type)mf] = ++MAC_counter; + // assert(m_MAC_table[(new_addr_type)mf]); + // m_HASH_queue->push(new unsigned(mf->get_id())); // once encrypted, hash the ciphertext for the MAC + // m_Ciphertext_queue->pop(); // the memory request can only be issued after encryption completes + } else { + if (!mf->is_raw()) { + // printf("RRRRRRRRRRRRRRR"); + } + if (m_AES_queue->full()) { + // printf("SSSSSSSSSSSSSSSSSSS"); + } + } + } else if (!m_unit->mee_dram_queue_full(NORM)) { // read + m_unit->mee_dram_queue_push(mf, NORM); // ciphertext read request; send it to DRAM + m_Ciphertext_queue->pop(); + CT_counter++; + } + } +} + +void mee::AES_cycle() { + if (!m_AES_queue->empty()) { + mem_fetch *mf = m_AES_queue->top(); + new_addr_type REQ_addr = (new_addr_type) mf; // plaintext/ciphertext of the encryption/decryption request + unsigned OTP_id = mf->get_id(); //OTP + int spid = m_unit->global_sub_partition_id_to_local_id(mf->get_sub_partition_id()); + // if (mf->get_sub_partition_id() == 0) + // printf("%x\n", OTP_addr); + // print_addr("waiting for AES:\t", mf); + assert(OTP_id); + // if (mf->is_write()) + // 
printf("PPPPPPPPPPPPPP\n"); + if (m_OTP_set[OTP_id]) { // 得到了OTP和明文/密文,AES加密/解密完成 + if (mf->is_write()) { //加密 + // assert(!mf->is_write()); + // printf("OOOOOOOOOOOOOOOOOOOOOO\n"); + if (!m_unit->mee_dram_queue_full(NORM) && !m_HASH_queue->full()) { + m_OTP_set[OTP_id]--; + m_unit->mee_dram_queue_push(mf, NORM); //加密完后更新DRAM中的密文 + CT_counter++; + m_HASH_queue->push(new hash(MAC, mf->get_id())); //加密完后得到密文,对密文进行MAC Hash + m_AES_queue->pop(); + m_Ciphertext_queue->pop(); //写密文发往DRAM + } + } else if (!m_unit->mee_L2_queue_full(spid)) { //解密 + m_OTP_set[OTP_id]--; + // m_OTP_table[REQ_addr] = 0; + // print_addr("mee to L2 R:\t", mf); + m_unit->mee_L2_queue_push(spid, mf); //解密完后返回L2 + print_addr("MEE to L2:\t", mf); + // printf("JJJJJJJJJJJJJJJJJJJJJJJJJ"); + m_AES_queue->pop(); + + } else { + // printf("IIIIIIIIIIIIIIII\n"); + } + } else { + // print_addr("waiting for AES:\t", mf); + // if (mf->is_write()) + // printf("%p %d AES waiting for OTP %d\n", mf, mf->get_sub_partition_id(), OTP_id); + } + } + + if (!m_OTP_queue->empty()){ + unsigned *mf = m_OTP_queue->top(); + if (mf) { + m_OTP_set[*mf]++; //OTP计算完成 + } + // delete mf; + m_OTP_queue->pop(); + } +} + +void mee::MAC_CHECK_cycle() { + if (!m_MAC_CHECK_queue->empty()) { + // printf("AAAAAAAAAAAAA\n"); + mem_fetch *mf = m_MAC_CHECK_queue->top(); + unsigned HASH_id = mf->get_id(); //MAC Hash值 + assert(HASH_id); + if (m_MAC_set[HASH_id]) { //得到了MAC与Hash值,MAC Check完成 + // if (m_unit->get_mpid() == 12) + // printf("MAC check: id %d sid %d\n", HASH_id, mf->get_sub_partition_id()); + m_MAC_set[HASH_id]--; + // m_MAC_table[REQ_addr] = 0; + m_MAC_CHECK_queue->pop(); + // printf("%p %d MAC HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); + } else { + // print_addr("waiting for MAC Check:\t", mf); + // if (mf->get_sub_partition_id() == 32) + // printf("%p %d MAC waiting for HASH %d\n", mf, mf->get_sub_partition_id(), HASH_id); + } + } + + if (!m_HASH_queue->empty()) { + // printf("BBBBBBBBBBBBBBB\n"); + hash *mf = m_HASH_queue->top(); + if (mf) { + // if (m_unit->get_mpid() == 0) + // printf("type:%d HASH :%d\n", mf->first, mf->get_id()); + if (mf->first == MAC) + m_MAC_set[mf->second]++; //MAC Hash计算完成 + if (mf->first == BMT) + m_BMT_set[mf->second]++; //BMT Hash计算完成 + m_HASH_queue->pop(); + } + // delete mf; + else + m_HASH_queue->pop(); + } +} + +void mee::BMT_CHECK_cycle() { + if (!m_BMT_CHECK_queue->empty()) { + // printf("AAAAAAAAAAAAA\n"); + mem_fetch *mf = m_BMT_CHECK_queue->top(); + new_addr_type REQ_addr = (new_addr_type) mf; //BMT Cache的值 + unsigned HASH_id = mf->get_id(); //BMT Hash值 + assert(mf->get_access_type() != META_RBW); + // if (mf->get_sub_partition_id() == 0) + // printf("%x\n", OTP_addr); + // assert(mf); + if (m_BMT_set[HASH_id] && ((m_config->m_META_config.m_cache_type == SECTOR && !m_BMT_queue->full(2)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_BMT_queue->full(2)))) { //得到了BMT与Hash值,BMT Check完成, 计算下一层BMT + m_BMT_set[HASH_id]--; + m_BMT_CHECK_queue->pop(); + print_addr("BMT Hash:\t", mf); + //计算下一层BMT + if (mf->get_BMT_Layer() == BMT_L4) { + // printf("AAAAAAAAAAAA\n"); + BMT_busy = false; + m_n_reqs_in_BMT--; + if (mf->get_id()) + BMT_counter++; + } else { + if (mf->is_write()) { + if (m_config->m_META_config.m_cache_type == SECTOR) { + gen_BMT_mf(mf, mf->is_write(), META_ACC, 2, HASH_id); // Lazy fetch on read策略下,写操作不会发给dram + assert(!m_BMT_queue->full()); + gen_BMT_mf(mf, false, META_ACC, 32, HASH_id); + } else { + gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, HASH_id); // Lazy fetch on 
read策略下,写操作不会发给dram + assert(!m_BMT_queue->full()); + gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); + } + } else { + if (m_config->m_META_config.m_cache_type == SECTOR) { + gen_BMT_mf(mf, false, META_ACC, 32, HASH_id); + } else { + gen_BMT_mf(mf, false, META_ACC, 128, HASH_id); + } + } + } + // if (m_unit->get_mpid() == 13) + // printf("BMT_queue size = %d\n", m_BMT_queue->get_n_element()); + } + } + + // if (!m_HASH_queue->empty()) { + // // printf("BBBBBBBBBBBBBBB\n"); + // hash *mf = m_HASH_queue->top(); + // if (mf) { + // if (mf->first == BMT) + // m_BMT_set[mf->first]++; //BMT Hash计算完成 + // } + // // delete mf; + // else + // m_HASH_queue->pop(); + // } + + // CTR to BMT + if (!m_CTR_BMT_Buffer->empty() && m_n_reqs_in_BMT < 64 && !m_HASH_queue->full()) { + // assert(cnt); + mem_fetch *mf = m_CTR_BMT_Buffer->top(); + // gen_BMT_mf(mf, mf->is_write(), META_ACC, 8, mf->get_id()); + print_addr("CTR to BMT:\t", mf); + // if (m_unit->get_mpid() == 13) + // printf("BMT_CHECK_queue size = %d\n", m_BMT_CHECK_queue->get_n_element()); + m_n_reqs_in_BMT++; + m_BMT_CHECK_queue->push(mf); + m_HASH_queue->push(new hash(BMT, mf->get_id())); + m_CTR_BMT_Buffer->pop(); + BMT_busy = true; + } +} + +void mee::CTR_cycle() { + if (!m_CTR_RET_queue->empty()) { + mem_fetch *mf_return = m_CTR_RET_queue->top(); + if (!mf_return->get_id() || mf_return->get_access_type() == META_RBW) { //更新CTR前的CTR读MISS返回 + m_CTR_RET_queue->pop(); + // delete mf_return;//删除1 + } else { //CTR读MISS返回,CTR写一定命中 + assert(!mf_return->is_write()); + // print_addr("MISS OTP:\t\t", mf_return); + if (!m_OTP_queue->full()) { //CTR读MISS,则应生成CTR to BMT任务 + m_OTP_queue->push(new unsigned(mf_return->get_id())); //得到CTR值,计算OTP用于解密 + m_CTR_RET_queue->pop(); + } + } + } + + m_CTRcache->cycle(); + CT_cycle(); + + bool output_full = m_OTP_queue->full() || m_CTR_RET_queue->full() || m_CTR_BMT_Buffer->full(); + bool port_free = m_unit->m_CTRcache->data_port_free(); + + if (!m_CTR_queue->empty() && !m_unit->mee_dram_queue_full(CTR) && !output_full && port_free) { + mem_fetch *mf = m_CTR_queue->top(); + // print_addr("CTR cycle access:\t\t", mf); + + if (mf->is_write()) { + if (m_CTRcache->probe(mf->get_addr(), mf) != HIT) {//读到CTR后,才可以CTR++,然后写CTR + return; + } + } + + std::list events; + enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + bool write_sent = was_write_sent(events); + bool read_sent = was_read_sent(events); + if (status == HIT) { + m_CTR_queue->pop(); + if (mf->is_write()) { //CTR更新了,BMT也要更新,生成CTR to BMT任务 + #ifdef BMT_Enable + // print_addr("CTR Write:\t", mf); + if (mf->get_id()) + m_CTR_BMT_Buffer->push(mf); + if (mf->get_id()) + CTR_counter++; + #endif + } + else if (mf->get_access_type() != META_RBW) { + if (mf->get_id()) + m_OTP_queue->push(new unsigned(mf->get_id())); //CTR HIT后计算OTP用于加密/解密 + if (mf->get_id()) + OTP_counter++; + } + // } + } else if (status != RESERVATION_FAIL) { + // set wating for CTR fill + print_addr("CTR MISS:\t", mf); + m_CTR_queue->pop(); + assert(!mf->is_write()); + if (mf->get_access_type() != META_RBW) { + if (mf->get_id()) + OTP_counter++; + #ifdef BMT_Enable + if (mf->get_id()) + CTR_counter++; + #endif + } + } else { + assert(!write_sent); + assert(!read_sent); + } + } + + // m_CTRcache->cycle(); +}; + +void mee::MAC_cycle() { + if (!m_MAC_RET_queue->empty()) { + mem_fetch *mf_return = m_MAC_RET_queue->top(); + if (mf_return->is_write()) { //写MAC完成 + m_MAC_RET_queue->pop(); + // delete mf_return;//删除2 + } else { 
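// [Editor's note] The MAC path is a rendezvous keyed by the request id:
// CT_cycle()/AES_cycle() push a (MAC, id) entry into m_HASH_queue, which models the
// hash latency (m_crypto_latency) and eventually increments m_MAC_set[id], while the
// MAC value itself is read through m_MACcache here in MAC_cycle() and forwarded to
// m_MAC_CHECK_queue. A condensed sketch of the check that MAC_CHECK_cycle() above
// performs (not new code, just the core of that logic):
//
//   mem_fetch *mf = m_MAC_CHECK_queue->top();
//   if (m_MAC_set[mf->get_id()]) {   // the ciphertext hash has arrived
//     m_MAC_set[mf->get_id()]--;     // consume it: the MAC check completes
//     m_MAC_CHECK_queue->pop();
//   }                                // otherwise stall until the hash is ready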
//MAC读MISS返回 + assert(!mf_return->is_write()); + if (!m_MAC_CHECK_queue->full()) { + m_MAC_CHECK_queue->push(mf_return); //MAC读MISS完成,得到MAC值,发往MAC Check + m_MAC_RET_queue->pop(); + } + } + } + + m_MACcache->cycle(); + + bool output_full = m_MAC_CHECK_queue->full() || m_MAC_RET_queue->full();// && + bool port_free = m_unit->m_MACcache->data_port_free(); + + if (!m_MAC_queue->empty() && !m_unit->mee_dram_queue_full(MAC) && !output_full && port_free) { + mem_fetch *mf = m_MAC_queue->top(); + // print_addr("MAC cycle access:\t\t", mf); + + assert(mf->get_id()); + + if (mf->is_write()) { //对于写MAC请求,则应等待密文被Hash为新MAC值 + if (!m_MAC_set[mf->get_id()]) { + return; + } + } + + std::list events; + enum cache_request_status status = m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + bool write_sent = was_write_sent(events); + bool read_sent = was_read_sent(events); + // print_addr("CTR cycle access:\t\t", mf); + if (status == HIT) { + if (mf->is_write()) { //MAC写HIT,则MAC Hash值使用结束 + // m_MAC_set[mf->get_id()]--; + } else { + m_MAC_CHECK_queue->push(mf); //MAC读HIT,得到MAC值,发往MAC Check + } + print_addr("MAC cycle access HIT:\t", mf); + print_status(m_MACcache, mf); + m_MAC_queue->pop(); + MAC_counter++; + // } + } else if (status != RESERVATION_FAIL) { + // set wating for CTR fill + print_addr("MAC cycle access MISS:\t", mf); + print_status(m_MACcache, mf); + if (mf->is_write()) { //MAC写MISS,则MAC Hash值使用结束 + // m_MAC_set[mf->get_id()]--; + } + m_MAC_queue->pop(); + MAC_counter++; + } else { + print_addr("MAC cycle RESERVATION_FAIL:\t", mf); + print_status(m_MACcache, mf); + // m_MACcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + // if (get_sub_partition_id(mf) == 0) + // enum cache_request_status status = m_CTRcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + // print_addr("MAC cycle RESERVATION_FAIL:\t", mf); + assert(!write_sent); + assert(!read_sent); + } + } +}; + +void mee::BMT_cycle() { + if (!m_BMT_RET_queue->empty()) { + mem_fetch *mf_return = m_BMT_RET_queue->top(); + // print_addr("MISS OTP:\t\t", mf_return); + if (mf_return->get_id() && !mf_return->is_write()) { + if (!m_BMT_CHECK_queue->full() && !m_HASH_queue->full()) { + m_BMT_CHECK_queue->push(mf_return); + m_HASH_queue->push(new hash(BMT, mf_return->get_id())); + m_BMT_RET_queue->pop(); + } + } else { + m_BMT_RET_queue->pop(); + } + } + + m_BMTcache->cycle(); + + bool output_full = m_BMT_CHECK_queue->full() || m_BMT_RET_queue->full() || m_HASH_queue->full(); + bool port_free = m_unit->m_BMTcache->data_port_free(); + + if (!m_BMT_queue->empty()) { + mem_fetch *mf = m_BMT_queue->top(); + // assert(mf->get_access_type() == META_RBW); + } + + if (!m_BMT_queue->empty() && !m_unit->mee_dram_queue_full(BMT) && !output_full && port_free) { + mem_fetch *mf = m_BMT_queue->top(); + // print_addr("BMT waiting access:\t", mf); + // assert(mf->get_access_type() == mf->get_access_type()); + + // if (mf->get_access_type() == META_RBW) { + // //对于BMT写,要等待上一层BMT Hash计算完,得到新的BMT值,才可以更新当前层BMT + // if (m_BMTcache->probe(mf->get_addr(), mf) != HIT) {//读到CTR后,才可以CTR++,然后写CTR + // return; + // } + // } + + std::list events; + enum cache_request_status status = m_BMTcache->access(mf->get_addr(), mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); + bool write_sent = was_write_sent(events); + bool read_sent = was_read_sent(events); + // print_addr("CTR cycle access:\t\t", mf); + if (status == HIT) { + // print_addr("BMT 
access HIT:\t", mf); + if (mf->get_id() && !mf->is_write()) { + m_BMT_CHECK_queue->push(mf); + m_HASH_queue->push(new hash(BMT, mf->get_id())); + } + m_BMT_queue->pop(); + } else if (status != RESERVATION_FAIL) { + // print_addr("BMT access MISS:\t", mf); + m_BMT_queue->pop(); + } else { + // print_addr("BMT access reservation_fail:\t", mf); + assert(!write_sent); + assert(!read_sent); + } + } +}; + +void mee::META_fill_responses(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK) { + if (m_METAcache->access_ready() && !m_META_RET_queue->full()) { + mem_fetch *mf = m_METAcache->next_access(); + if (mf->get_access_type() == META_ACC && mf->get_id()) + m_META_RET_queue->push(mf); + // assert(mf->get_access_type() == META_ACC); + // if (m_METAcache == m_BMTcache) + print_addr("fill responses:\t", mf); + // reply(m_METAcache, mf); + // delete mf; + } else { + if (m_META_RET_queue->full()){ + // printf("fill responses ERROR: %d\n", m_unit->get_mpid()); + } + } +} + +void mee::META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type) { + // if (m_METAcache == m_BMTcache) printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); + + if (!m_unit->dram_mee_queue_empty(m_data_type)) { + mem_fetch *mf_return = m_unit->dram_mee_queue_top(m_data_type); + #ifdef BMT_Enable + if (m_data_type == CTR && mf_return->get_access_type() == META_ACC) + if (!m_META_RET_queue->full()) + m_META_RET_queue->push(mf_return); + else + return; + #endif + if ((mf_return->get_data_type() == m_data_type) && m_METAcache->waiting_for_fill(mf_return)) { + // print_addr("wating for fill:\t\t", mf); + if (m_METAcache->fill_port_free()) { + // assert(mf->get_access_type() != META_WR_ALLOC_R); + print_addr("fill: \t\t", mf_return); + m_METAcache->fill(mf_return, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + + m_memcpy_cycle_offset); + if (m_METAcache == m_MACcache) + print_status(m_METAcache, mf_return); + // print_addr("MAC fill:\t", mf); + assert(!mf_return->is_write()); + // if (m_METAcache == m_BMTcache) + // print_addr("fill:\t\t\t\t", mf); + // printf("%llx & %llx == %llx\n", mf->get_addr(), BASE, mf->get_addr() & BASE); + // if (mf->get_sub_partition_id() == 1) { + // printf("CTR Fill: %p\n", mf); + // // printf("CTR Next: %p\n", m_CTR_queue->top()); + // } + m_unit->dram_mee_queue_pop(m_data_type); + } else { + // print_addr("fill ERROR:\t", mf_return); + } + } else if (mf_return->get_data_type() == m_data_type) { + if (mf_return->is_write() && mf_return->get_type() == WRITE_ACK) + mf_return->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + // m_META_RET_queue->push(mf); + m_unit->dram_mee_queue_pop(m_data_type); + } + } +} + +void mee::pr(fifo_pipeline *m_META_RET_queue) { + printf("%d\n",m_META_RET_queue->get_length()); +} + +void mee::simple_cycle(unsigned cycle) { + // printf("AAAAAAAAAAAAAAAAAAAAAA"); + // pr(m_CTR_BMT_Buffer); + // META Cache fill responses + META_fill_responses(m_CTRcache, m_CTR_RET_queue, CTR_mask); + META_fill_responses(m_MACcache, m_MAC_RET_queue, MAC_mask); + // for (int layer = 1; layer <= 4; layer++){ + META_fill_responses(m_BMTcache, m_BMT_RET_queue, BMT_mask[1]); + // } + // META_fill_responses(m_BMTcache); + META_fill(m_CTRcache, m_CTR_BMT_Buffer, NULL, CTR_mask, CTR_base, CTR); + META_fill(m_MACcache, m_MAC_RET_queue, NULL, MAC_mask, MAC_base, MAC); + META_fill(m_BMTcache, 
m_BMT_RET_queue, NULL, BMT_mask[1], BMT_base[1], BMT); + + // dram to mee + if (!m_unit->dram_mee_queue_empty(NORM)) { + mem_fetch *mf_return = m_unit->dram_mee_queue_top(NORM); + // assert(!mf_return->is_write()); + // if (mf_return->get_sub_partition_id() == 58) + // print_addr("waiting for fill:\t", mf_return); + // printf("%saddr: %x\tdata_type: %d\tsp_addr: %x\taccess type:%d\n", "fill queue:\t", mf->get_addr(), mf->get_data_type(), mf->get_partition_addr(), mf->get_access_type()); + + if (false + // mf_return->get_access_type() == L1_WR_ALLOC_R || + // // mf_return->get_access_type() == L2_WR_ALLOC_R || + // mf_return->get_access_type() == L1_WRBK_ACC || + // mf_return->get_access_type() == L2_WRBK_ACC + ) { + assert(mf_return->get_access_type() == 4 && !mf_return->is_write()); + m_unit->dram_mee_queue_pop(NORM); + } else { + + print_addr("dram to mee:\t", mf_return); + // mee to L2 + + // META_fill(m_MACcache, mf_return, MAC_mask); + // META_fill(m_BMTcache, mf_return); + // if (!m_unit->mee_L2_queue_full()) { + // reply L2 read + // reply L2 write back + //m_unit->mee_L2_queue_push(m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()), mf_return); + int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + assert(mf_return->get_access_type() < META_ACC); + if (!m_Ciphertext_RET_queue->full()) { + // m_AES_queue->push(mf_return); //密文从DRAM返回,送往AES解密 + // m_MAC_table[(new_addr_type)mf_return] = ++MAC_counter; + // assert(m_MAC_table[(new_addr_type)mf_return]); + // m_HASH_queue->push(new unsigned(m_MAC_table[(new_addr_type)mf_return])); //对密文进行hash,用于MAC Check + m_Ciphertext_RET_queue->push(mf_return); + m_unit->dram_mee_queue_pop(NORM); + // printf("HHHHHHHHHHHHHHHH"); + } else { + // printf("HHHHHHHHHHHHHHHH"); + } + } + } else if (!m_unit->mee_dram_queue_empty()) { + // printf("SSSSSSSSSSSSSSS %d\n", ); + } + // printf("L2 to mee queue: %d %d\n", m_unit->m_sub_partition[0]->m_L2_mee_queue->empty(), m_unit->m_sub_partition[0]->m_L2_mee_queue->empty()); + // L2 to mee + if (!m_unit->L2_mee_queue_empty(cycle&1)) { + mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); + // print_addr("waiting for access:\t", mf); + // if (mf->get_access_type() == 9) + // printf("%saddr: %x\tsp_id: %d\tsp_addr: %x\taccess type:%d\n", "L2 to mee:\t", mf->get_addr(), mf->get_sid(), mf->get_partition_addr(), mf->get_access_type()); + + + // mee to dram + assert(mf->is_raw()); + // printf("TTTTTTTTTTTTTTTT\n"); + + if (((m_config->m_META_config.m_cache_type == SECTOR && !m_CTR_queue->full(8)) || (m_config->m_META_config.m_cache_type != SECTOR && !m_CTR_queue->full(2))) + && !m_MAC_queue->full() && !m_Ciphertext_queue->full()) { + print_addr("L2 to mee: ", mf); + DL_CNT = 0; + // assert(!mf->is_write()); + if (mf->is_write()) { // write + assert(mf->is_raw()); + // printf("LLLLLLLLLLLLLLLLLLL"); + // if (!m_Ciphertext_queue->full()) { + mf_counter++; + mf->set_id(mf_counter); + + // gen_CTR_mf(mf, false, META_RBW, 16, mf_counter);//Lazy_ftech_on_read + // gen_CTR_mf(mf, false, META_ACC, 1, mf_counter);//Lazy_ftech_on_read + // gen_CTR_mf(mf, true, META_RBW, 16, mf_counter); + // gen_CTR_mf(mf, true, META_ACC, 1, mf_counter); + // gen_CTR_mf(mf, false, META_ACC, 128, mf_counter);//Lazy_ftech_on_read + // gen_CTR_mf(mf, true, META_ACC, 128, mf_counter); + + if (m_config->m_META_config.m_cache_type == SECTOR) { + gen_CTR_mf(mf, false, META_ACC, 32, mf_counter);//Lazy_ftech_on_read + gen_CTR_mf(mf, true, META_ACC, 32, mf_counter); + } + else { + 
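// [Editor's note] Each data request that crosses from L2 into the MEE is tagged with a
// fresh id (mf_counter) and fans out into metadata traffic sharing that id. For the
// non-sector META-cache write path right below, the fan-out is (id stands for the
// mf_counter value assigned above):
//
//   gen_CTR_mf(mf, false, META_ACC, 128, id);  // read the counter block
//   gen_CTR_mf(mf, true,  META_ACC, 128, id);  // then bump it and write it back
//   gen_MAC_mf(mf, true,  META_ACC, 8,   id);  // write the new MAC
//   m_Ciphertext_queue->push(mf);              // plaintext waits for AES in CT_cycle()/AES_cycle()
//
// The read path issues a counter read and a MAC read, and the ciphertext read itself
// also goes through m_Ciphertext_queue. Metadata addresses are derived from the data
// address and mapped into reserved regions, e.g. gen_CTR_mf() above computes roughly
//
//   CTR_addr = get_addr(sub_partition_id, partition_addr >> 7) | CTR_base;
//
// so every 128-byte data block maps to a fixed location inside the reserved CTR region
// (the region bases are the CTR_base/MAC_base/BMT_base constants in mee.h).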
gen_CTR_mf(mf, false, META_ACC, 128, mf_counter);//Lazy_ftech_on_read + gen_CTR_mf(mf, true, META_ACC, 128, mf_counter); + } + + #ifdef MAC_Enable + if (m_config->m_META_config.m_cache_type == SECTOR) + gen_MAC_mf(mf, true, META_ACC, 4, mf_counter); + else + gen_MAC_mf(mf, true, META_ACC, 8, mf_counter); + #endif + + // m_AES_queue->push(mf); //写密文请求,将明文送入AES中解密 + m_Ciphertext_queue->push(mf); + m_unit->L2_mee_queue_pop(cycle&1); + // mf->set_cooked_status(); + // printf("BBBBBBBBBBBBBBBBB"); + // } + } else if (!m_unit->mee_dram_queue_full(NORM)) { // read + // printf("CCCCCCCCCCCCCCCC"); + // m_unit->mee_dram_queue_push(mf); //读密文请求,发往DRAM中读密文 + mf_counter++; + mf->set_id(mf_counter); + m_Ciphertext_queue->push(mf); + if (m_config->m_META_config.m_cache_type == SECTOR) { + gen_CTR_mf(mf, false, META_ACC, 32, mf_counter); + } + else { + gen_CTR_mf(mf, false, META_ACC, 128, mf_counter); + } + // gen_CTR_mf(mf, false, META_ACC, 128, mf_counter); + #ifdef MAC_Enable + if (m_config->m_META_config.m_cache_type == SECTOR) + gen_MAC_mf(mf, false, META_ACC, 4, mf_counter); + else + gen_MAC_mf(mf, false, META_ACC, 8, mf_counter); + #endif + m_unit->L2_mee_queue_pop(cycle&1); + } + } else { + DL_CNT++; + if (DL_CNT >= 10000) { + printf("DEAD LOCK! mpid: %d\n", m_unit->get_mpid()); + } + // if (m_unit->get_mpid() == 0){ + // if (m_CTR_RET_queue->full()) + // printf("AAAAAAAAAAAAAAAAAAAAAA"); + // if (m_MAC_RET_queue->full()) + // printf("BBBBBBBBBBBBBBBBB"); + // if (m_BMT_RET_queue->full()) + // printf("CCCCCCCCCCCC"); + // if (m_AES_queue->full()) + // printf("DDDDDDDDDDDDDDDD"); + // if (m_AES_queue->full()) + // printf("EEEEEEEEEEEEEEEE"); + // if (m_unit->mee_dram_queue_empty()) + // printf("FFFFFFFFFFFFFFFFFF"); + // } + + } + } else { + // printf("GGGGGGGGGGGGGG\n"); + } + MAC_CHECK_cycle(); + MAC_cycle(); + BMT_CHECK_cycle(); + BMT_cycle(); + AES_cycle(); + CTR_cycle(); + // CT_cycle(); +} + +void mee::cycle(unsigned cycle) { + if (!m_unit->dram_mee_queue_empty(NORM)) { + mem_fetch *mf_return = m_unit->dram_mee_queue_top(NORM); + int spid = m_unit->global_sub_partition_id_to_local_id(mf_return->get_sub_partition_id()); + if (false + // mf_return->get_is_write() || + // mf_return->get_access_type() == L1_WR_ALLOC_R || + // mf_return->get_access_type() == L2_WR_ALLOC_R || + // mf_return->get_access_type() == L1_WRBK_ACC || + // mf_return->get_access_type() == L2_WRBK_ACC + ) { + // assert(mf_return->get_access_type() == 4 && !mf_return->is_write()); + m_unit->dram_mee_queue_pop(NORM); + } else { + if (!m_unit->mee_L2_queue_full(spid)) { + // m_OTP_table[REQ_addr] = 0; + // print_addr("mee to L2 R:\t", mf); + m_unit->mee_L2_queue_push(spid, mf_return); + m_unit->dram_mee_queue_pop(NORM); + + } + } + } + if (!m_unit->L2_mee_queue_empty(cycle&1)) { + mem_fetch *mf = m_unit->L2_mee_queue_top(cycle&1); + if (!m_unit->mee_dram_queue_full(NORM)) { + m_unit->mee_dram_queue_push(mf, NORM); + m_unit->L2_mee_queue_pop(cycle&1); + } + } +} + +//BMT next Layer +//BMT buzy +//BMT erase +//BMT write需要阻塞,CTR read可以连续访问 +//BMT 写前读 ok + +//ok BMT +//ok 检查写操作 +//ok 读密文在CTR访存前阻塞 +//ok 实现mf id匹配 +//ok BMT不需要每层都Check +//ok 增加访存类型的属性 +//ok 单个HASH单元 +//ok None Sector +//lazy_fetch_on_read不能和None_Sector混用,因为设置modified会Sector_MISS + +//Sector +//deepbench +//可配置 +//lazy_fetch_on_read + +//mee<-->dram queue +//write back +//BMT_Layer + +//CTR_counter <= BMT_counter +//CT_counter < OTP_counter +//MAC_counter < CT_counter + + +//实现一个中间类,bridge +// \ No newline at end of file diff --git a/src/gpgpu-sim/mee.h 
b/src/gpgpu-sim/mee.h new file mode 100644 index 000000000..909e77a68 --- /dev/null +++ b/src/gpgpu-sim/mee.h @@ -0,0 +1,134 @@ + +// class mem_fetch; +// class memory_sub_partition; +// class gpgpu_sim; +// class new_addr_type; +// class mem_access_type; +// class memory_config; +#include "mem_fetch.h" +#include "l2cache.h" +#include "shader.h" +#include "gpu-sim.h" + +class mee { + public: + mee(class memory_partition_unit *unit, class meta_cache *CTRcache, class meta_cache *MACcache, class meta_cache *BMTcache, const memory_config *config, class gpgpu_sim *gpu); + void cycle(unsigned cycle); + void simple_cycle(unsigned cycle); + void print_addr(char s[], mem_fetch *mf); + void print_status(class meta_cache *m_METAcache, mem_fetch *mf); + void print_tag(); + void meta_access(fifo_pipeline *m_META_queue, new_addr_type addr, mem_access_type type, + unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf, unsigned mf_id, enum data_type m_data_type, enum BMT_Layer m_Layer) const; + void CTR_cycle(); + void MAC_cycle(); + void BMT_cycle(); + void AES_cycle(); + void CT_cycle(); + void MAC_CHECK_cycle(); + void BMT_CHECK_cycle(); + new_addr_type get_partition_addr(mem_fetch *mf); + new_addr_type get_sub_partition_id(mem_fetch *mf); + new_addr_type get_addr(new_addr_type partition_id, new_addr_type partition_addr); + + unsigned int get_BMT_Layer(new_addr_type addr); + void gen_CTR_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id); + void gen_MAC_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id); + void gen_BMT_mf(mem_fetch *mf, bool wr, mem_access_type meta_acc, unsigned size, unsigned mf_id); + bool META_queue_empty(); + + void META_fill_responses(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, const new_addr_type MASK); + void META_fill(class meta_cache *m_METAcache, fifo_pipeline *m_META_RET_queue, mem_fetch *mf, const new_addr_type MASK, const new_addr_type BASE, enum data_type m_data_type); + + bool CTR_busy(); + bool MAC_busy(); + bool BMT_busy; + void pr(fifo_pipeline *m_META_RET_queue); + + + + private: + typedef std::pair hash; + class meta_cache *m_CTRcache; + class meta_cache *m_MACcache; + class meta_cache *m_BMTcache; + class memory_partition_unit *m_unit; + const memory_config *m_config; + class gpgpu_sim *m_gpu; + fifo_pipeline *m_CTR_queue; + fifo_pipeline *m_Ciphertext_queue; + fifo_pipeline *m_MAC_queue; + fifo_pipeline *m_BMT_queue; + + fifo_pipeline *m_CTR_RET_queue; + fifo_pipeline *m_MAC_RET_queue; + fifo_pipeline *m_BMT_RET_queue; + fifo_pipeline *m_Ciphertext_RET_queue; + + fifo_pipeline *m_OTP_queue; + fifo_pipeline *m_AES_queue; + + fifo_pipeline *m_HASH_queue; + fifo_pipeline *m_MAC_CHECK_queue; + + //m_CTR_BMT_Buffer-->m_BMT_CHECK_queue--|--> + // |->m_HASH_queue---| + // m_BMT_queue-->m_BMT_RET_queue--> + fifo_pipeline *m_BMT_CHECK_queue; + // fifo_pipeline *m_HASH_queue; + fifo_pipeline *m_CTR_BMT_Buffer; + + //CTR: 1111 1110 0000 0000 0000 0000 0000 0000 + //L1 : 1111 1111 1110 0000 0000 0000 0000 0000 + //L2 : 1111 1111 1111 1110 0000 0000 0000 0000 + //L3 : 1111 1111 1111 1111 1100 0000 1000 0000 + //L4 : 1111 1111 1111 1111 1100 0000 1111 1000 + //ROOT:1111 1111 1111 1111 1100 0000 1111 1000 + const new_addr_type BMT_mask[5] = {0xFE000000, 0xFFE00000, 0xFFFE0000, 0xFFFFC080, 0xFFFFC0F8}; + + const new_addr_type CTR_mask = 0xFE000000;//1111 000x xxxx xxxx xxxx xxxx xxxx xxxx + const new_addr_type MAC_mask = 
0xF0000000;//1110 xxxx xxxx xxxx xxxx xxxx xxxx x000 + + //CTR: 1111 000x xxxx xxxx xxxx xxxx xxxx xxxx + //L1 : 1111 0010 000x xxxx xxxx xxxx xxxx x000 + //L2 : 1111 0010 0010 000x xxxx xxxx xxxx x000 + //L3 : 1111 0010 0010 0010 00xx xxxx 0xxx x000 + //L4 : 1111 0010 0010 0010 00xx xxxx 1000 0000 + //ROOT:1111 0010 0010 0010 00xx xxxx 1000 1000 + const new_addr_type BMT_base[5] = {0xF0000000, 0xF2000000, 0xF2200000, 0xF2220000, 0xF2220080}; + + const new_addr_type CTR_base = 0xF0000000;//1111 000x xxxx xxxx xxxx xxxx xxxx xxxx + const new_addr_type MAC_base = 0xE0000000;//1110 xxxx xxxx xxxx xxxx xxxx xxxx x000 + + const int m_memcpy_cycle_offset = 0; + const int mee_busy_mask = 0; + + typedef tr1_hash_map table; + typedef tr1_hash_map set; + table m_OTP_table; //<密文,OTP(CTR)> + set m_OTP_set; // + table m_MAC_table; // + set m_MAC_set; // + table m_BMT_table; // + set m_BMT_set; // + //1111 1111 1111 1111 1100 0000 1111 1000 + mem_fetch *BMT_ROOT_mf = NULL; + int cnt = 0; + + unsigned mf_counter = 0; + unsigned CT_counter = 0; + unsigned OTP_counter = 0; + unsigned MAC_counter = 0; + unsigned CTR_counter = 0; + unsigned BMT_counter = 0; + unsigned m_n_reqs_in_BMT = 0; + int var; + unsigned DL_CNT = 0; + + + + public: + counterMap *m_ctrModCount; + counterMap* get_ctrModCount() { return m_ctrModCount; } +}; \ No newline at end of file diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index 456d891dd..9ef25d61c 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -69,6 +69,8 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, m_raw_addr.chip = m_original_mf->get_tlx_addr().chip; m_raw_addr.sub_partition = m_original_mf->get_tlx_addr().sub_partition; } + raw_data = true; + id = 0; } mem_fetch::~mem_fetch() { m_status = MEM_FETCH_DELETED; } diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h index e039846e3..55c01f4f5 100644 --- a/src/gpgpu-sim/mem_fetch.h +++ b/src/gpgpu-sim/mem_fetch.h @@ -33,6 +33,24 @@ #include "../abstract_hardware_model.h" #include "addrdec.h" +enum data_type { + TOT = 0, + BMT, + CTR, + NORM, + MAC, + NUM_DATA_TYPE +}; + +enum BMT_Layer { + DEFAULT = 0, + BMT_L1, + BMT_L2, + BMT_L3, + BMT_L4, + BMT_ROOT +}; + enum mf_type { READ_REQUEST = 0, WRITE_REQUEST, @@ -128,6 +146,18 @@ class mem_fetch { mem_fetch *get_original_mf() { return original_mf; } mem_fetch *get_original_wr_mf() { return original_wr_mf; } + bool is_raw() {return raw_data; } + void set_cooked_status() {raw_data = false; } + + unsigned get_id() { return this->id; } + void set_id(unsigned id) { this->id = id; } + + enum data_type get_data_type() { return this->m_data_type; } + void set_data_type(enum data_type dtype) { this->m_data_type = dtype; } + + enum BMT_Layer get_BMT_Layer() { return this->m_BMT_Layer; } + void set_BMT_Layer(enum BMT_Layer Layer) { this->m_BMT_Layer = Layer; } + private: // request source information unsigned m_request_uid; @@ -174,6 +204,10 @@ class mem_fetch { // size), so the pointer refers to the original request mem_fetch *original_wr_mf; // this pointer refers to the original write req, // when fetch-on-write policy is used + bool raw_data = true; + unsigned id; + enum data_type m_data_type = NORM; + enum BMT_Layer m_BMT_Layer = DEFAULT; }; #endif diff --git a/src/gpgpu-sim/power_interface.cc b/src/gpgpu-sim/power_interface.cc index c637d846f..63b985260 100644 --- a/src/gpgpu-sim/power_interface.cc +++ b/src/gpgpu-sim/power_interface.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. 
Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,8 +27,10 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. + #include "power_interface.h" + void init_mcpat(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, unsigned stat_sample_freq, unsigned tot_inst, unsigned inst) { @@ -38,7 +41,11 @@ void init_mcpat(const gpgpu_sim_config &config, config.g_power_simulation_enabled, config.g_power_trace_enabled, config.g_steady_power_levels_enabled, config.g_power_per_cycle_dump, config.gpu_steady_power_deviation, config.gpu_steady_min_period, - config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq); + config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq, + config.g_power_simulation_mode, + config.g_dvfs_enabled, + config.get_core_freq()/1000000, + config.num_shader()); } void mcpat_cycle(const gpgpu_sim_config &config, @@ -46,7 +53,7 @@ void mcpat_cycle(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst) { + unsigned inst, bool dvfs_enabled) { static bool mcpat_init = true; if (mcpat_init) { // If first cycle, don't have any power numbers yet @@ -55,41 +62,45 @@ void mcpat_cycle(const gpgpu_sim_config &config, } if ((tot_cycle + cycle) % stat_sample_freq == 0) { + if(dvfs_enabled){ + wrapper->set_model_voltage(1); //performance model needs to support this. 
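// A minimal, standalone sketch of the sampling cadence used by mcpat_cycle above:
// the power model is only fed every stat_sample_freq cycles, and with DVFS enabled
// the modeled voltage is refreshed first (set_model_voltage(1) as in the patch).
// WrapperStub and power_tick are illustrative stand-ins, not GPGPU-Sim APIs.
#include <cstdio>

struct WrapperStub {                 // stand-in for gpgpu_sim_wrapper
  void set_model_voltage(double v) { std::printf("voltage=%.2f\n", v); }
  void sample() { std::printf("push power sample\n"); }
};

void power_tick(WrapperStub &w, unsigned tot_cycle, unsigned cycle,
                unsigned stat_sample_freq, bool dvfs_enabled) {
  if ((tot_cycle + cycle) % stat_sample_freq != 0) return;  // not a sample point
  if (dvfs_enabled) w.set_model_voltage(1);  // mirrors the dvfs hook above
  w.sample();                                // push the counters to the power model
}

int main() {
  WrapperStub w;
  for (unsigned c = 1; c <= 2000; ++c) power_tick(w, 0, c, 500, true);
}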
+ } + wrapper->set_inst_power( shdr_config->gpgpu_clock_gated_lanes, stat_sample_freq, - stat_sample_freq, power_stats->get_total_inst(), - power_stats->get_total_int_inst(), power_stats->get_total_fp_inst(), - power_stats->get_l1d_read_accesses(), - power_stats->get_l1d_write_accesses(), - power_stats->get_committed_inst()); + stat_sample_freq, power_stats->get_total_inst(0), + power_stats->get_total_int_inst(0), power_stats->get_total_fp_inst(0), + power_stats->get_l1d_read_accesses(0), + power_stats->get_l1d_write_accesses(0), + power_stats->get_committed_inst(0)); // Single RF for both int and fp ops - wrapper->set_regfile_power(power_stats->get_regfile_reads(), - power_stats->get_regfile_writes(), - power_stats->get_non_regfile_operands()); + wrapper->set_regfile_power(power_stats->get_regfile_reads(0), + power_stats->get_regfile_writes(0), + power_stats->get_non_regfile_operands(0)); // Instruction cache stats - wrapper->set_icache_power(power_stats->get_inst_c_hits(), - power_stats->get_inst_c_misses()); + wrapper->set_icache_power(power_stats->get_inst_c_hits(0), + power_stats->get_inst_c_misses(0)); // Constant Cache, shared memory, texture cache - wrapper->set_ccache_power(power_stats->get_constant_c_hits(), - power_stats->get_constant_c_misses()); + wrapper->set_ccache_power(power_stats->get_const_accessess(0), 0); //assuming all HITS in constant cache for now wrapper->set_tcache_power(power_stats->get_texture_c_hits(), power_stats->get_texture_c_misses()); - wrapper->set_shrd_mem_power(power_stats->get_shmem_read_access()); + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(0)); wrapper->set_l1cache_power( - power_stats->get_l1d_read_hits(), power_stats->get_l1d_read_misses(), - power_stats->get_l1d_write_hits(), power_stats->get_l1d_write_misses()); + power_stats->get_l1d_read_hits(0), power_stats->get_l1d_read_misses(0), + power_stats->get_l1d_write_hits(0), power_stats->get_l1d_write_misses(0)); wrapper->set_l2cache_power( - power_stats->get_l2_read_hits(), power_stats->get_l2_read_misses(), - power_stats->get_l2_write_hits(), power_stats->get_l2_write_misses()); + power_stats->get_l2_read_hits(0), power_stats->get_l2_read_misses(0), + power_stats->get_l2_write_hits(0), power_stats->get_l2_write_misses(0)); float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; float num_cores = shdr_config->num_shader(); float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); wrapper->set_idle_core_power(num_idle_core); // pipeline power - pipeline_duty_cycle *= percent_active_sms; @@ -101,38 +112,64 @@ void mcpat_cycle(const gpgpu_sim_config &config, wrapper->set_duty_cycle_power(pipeline_duty_cycle); // Memory Controller - wrapper->set_mem_ctrl_power(power_stats->get_dram_rd(), - power_stats->get_dram_wr(), - power_stats->get_dram_pre()); + wrapper->set_mem_ctrl_power(power_stats->get_dram_rd(0), + power_stats->get_dram_wr(0), + power_stats->get_dram_pre(0)); // Execution pipeline accesses // FPU (SP) accesses, Integer ALU (not present in Tesla), Sfu accesses - wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(), - power_stats->get_ialu_accessess(), - power_stats->get_tot_sfu_accessess()); + + wrapper->set_int_accesses(power_stats->get_ialu_accessess(0), + power_stats->get_intmul24_accessess(0), + power_stats->get_intmul32_accessess(0), + power_stats->get_intmul_accessess(0), + power_stats->get_intdiv_accessess(0)); + + wrapper->set_dp_accesses(power_stats->get_dp_accessess(0), + power_stats->get_dpmul_accessess(0), + 
power_stats->get_dpdiv_accessess(0)); + + wrapper->set_fp_accesses(power_stats->get_fp_accessess(0), + power_stats->get_fpmul_accessess(0), + power_stats->get_fpdiv_accessess(0)); + + wrapper->set_trans_accesses(power_stats->get_sqrt_accessess(0), + power_stats->get_log_accessess(0), + power_stats->get_sin_accessess(0), + power_stats->get_exp_accessess(0)); + + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(0)); + + wrapper->set_tex_accesses(power_stats->get_tex_accessess(0)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(0), + power_stats->get_ialu_accessess(0), + power_stats->get_tot_sfu_accessess(0)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(0)); // Average active lanes for sp and sfu pipelines float avg_sp_active_lanes = (power_stats->get_sp_active_lanes()) / stat_sample_freq; float avg_sfu_active_lanes = (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes >32.0 ) + avg_sfu_active_lanes = 32.0; assert(avg_sp_active_lanes <= 32); assert(avg_sfu_active_lanes <= 32); - wrapper->set_active_lanes_power( - (power_stats->get_sp_active_lanes()) / stat_sample_freq, - (power_stats->get_sfu_active_lanes()) / stat_sample_freq); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); double n_icnt_simt_to_mem = (double) - power_stats->get_icnt_simt_to_mem(); // # flits from SIMT clusters + power_stats->get_icnt_simt_to_mem(0); // # flits from SIMT clusters // to memory partitions double n_icnt_mem_to_simt = (double) - power_stats->get_icnt_mem_to_simt(); // # flits from memory + power_stats->get_icnt_mem_to_simt(0); // # flits from memory // partitions to SIMT clusters - wrapper->set_NoC_power( - n_icnt_mem_to_simt, - n_icnt_simt_to_mem); // Number of flits traversing the interconnect + wrapper->set_NoC_power(n_icnt_mem_to_simt + n_icnt_simt_to_mem); // Number of flits traversing the interconnect wrapper->compute(); @@ -152,3 +189,336 @@ void mcpat_cycle(const gpgpu_sim_config &config, void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper) { wrapper->reset_counters(); } + +bool parse_hw_file(char* hwpowerfile, bool find_target_kernel, vector &hw_data, char* benchname, std::string executed_kernelname){ + fstream hw_file; + hw_file.open(hwpowerfile, ios::in); + string line, word, temp; + while(!hw_file.eof()){ + hw_data.clear(); + getline(hw_file, line); + stringstream s(line); + while (getline(s,word,',')){ + hw_data.push_back(word); + } + if(hw_data[HW_BENCH_NAME] == std::string(benchname)){ + if(find_target_kernel){ + if(hw_data[HW_KERNEL_NAME] == ""){ + hw_file.close(); + return true; + } + else{ + if(hw_data[HW_KERNEL_NAME] == executed_kernelname){ + hw_file.close(); + return true; + } + } + } + else{ + hw_file.close(); + return true; + } + } + } + hw_file.close(); + return false; +} + + +void calculate_hw_mcpat(const gpgpu_sim_config &config, + const shader_core_config *shdr_config, + class gpgpu_sim_wrapper *wrapper, + class power_stat_t *power_stats, unsigned stat_sample_freq, + unsigned tot_cycle, unsigned cycle, unsigned tot_inst, + unsigned inst, int power_simulation_mode, bool dvfs_enabled, char* hwpowerfile, + char* benchname, std::string executed_kernelname, + const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats){ + + /* Reading HW data from CSV file */ + + vector hw_data; + bool kernel_found = false; + kernel_found = parse_hw_file(hwpowerfile, true, hw_data, benchname, 
executed_kernelname); //Searching for matching executed_kernelname. + if(!kernel_found) + kernel_found = parse_hw_file(hwpowerfile, false, hw_data, benchname, executed_kernelname); //Searching for any kernel with same benchname. + assert("Could not find perf stats for the target benchmark in hwpowerfile.\n" && (kernel_found)); + unsigned perf_cycles = static_cast(std::stod(hw_data[HW_CYCLES]) + 0.5); + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_CYCLES])) + perf_cycles = cycle; + wrapper->init_mcpat_hw_mode(perf_cycles); //total PERF MODEL cycles for current kernel + + if(dvfs_enabled){ + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_VOLTAGE])) + wrapper->set_model_voltage(1); //performance model needs to support this + else + wrapper->set_model_voltage(std::stod(hw_data[HW_VOLTAGE])); //performance model needs to support this + } + + double l1_read_hits = std::stod(hw_data[HW_L1_RH]); + double l1_read_misses = std::stod(hw_data[HW_L1_RM]); + double l1_write_hits = std::stod(hw_data[HW_L1_WH]); + double l1_write_misses = std::stod(hw_data[HW_L1_WM]); + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_RH])) + l1_read_hits = power_stats->get_l1d_read_hits(1) - power_stats->l1r_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_RM])) + l1_read_misses = power_stats->get_l1d_read_misses(1) - power_stats->l1r_misses_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WH])) + l1_write_hits = power_stats->get_l1d_write_hits(1) - power_stats->l1w_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WM])) + l1_write_misses = power_stats->get_l1d_write_misses(1) - power_stats->l1w_misses_kernel; + + if(aggregate_power_stats){ + power_stats->tot_inst_execution += power_stats->get_total_inst(1); + power_stats->tot_int_inst_execution += power_stats->get_total_int_inst(1); + power_stats->tot_fp_inst_execution += power_stats->get_total_fp_inst(1); + power_stats->commited_inst_execution += power_stats->get_committed_inst(1); + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, cycle, //TODO: core.[0] cycles counts don't matter, remove this + cycle, power_stats->tot_inst_execution, + power_stats->tot_int_inst_execution, power_stats->tot_fp_inst_execution, + l1_read_hits + l1_read_misses, + l1_write_hits + l1_write_misses, + power_stats->commited_inst_execution); + } + else{ + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, cycle, //TODO: core.[0] cycles counts don't matter, remove this + cycle, power_stats->get_total_inst(1), + power_stats->get_total_int_inst(1), power_stats->get_total_fp_inst(1), + l1_read_hits + l1_read_misses, + l1_write_hits + l1_write_misses, + power_stats->get_committed_inst(1)); + } + + // Single RF for both int and fp ops -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for register files + wrapper->set_regfile_power(power_stats->get_regfile_reads(1), + power_stats->get_regfile_writes(1), + power_stats->get_non_regfile_operands(1)); + + // Instruction cache stats -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for instruction cache + wrapper->set_icache_power(power_stats->get_inst_c_hits(1) - power_stats->l1i_hits_kernel, + power_stats->get_inst_c_misses(1) - power_stats->l1i_misses_kernel); + + // Constant Cache, shared memory, texture cache + if((power_simulation_mode == 
2) && (accelwattch_hybrid_configuration[HW_CC_ACC])) + wrapper->set_ccache_power(power_stats->get_const_accessess(1) - power_stats->cc_accesses_kernel, 0); //assuming all HITS in constant cache for now + else + wrapper->set_ccache_power(std::stod(hw_data[HW_CC_ACC]), 0); //assuming all HITS in constant cache for now + + + // wrapper->set_tcache_power(power_stats->get_texture_c_hits(), + // power_stats->get_texture_c_misses()); + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_SHRD_ACC])) + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(1) - power_stats->shared_accesses_kernel); + else + wrapper->set_shrd_mem_power(std::stod(hw_data[HW_SHRD_ACC])); + + wrapper->set_l1cache_power( l1_read_hits, l1_read_misses, l1_write_hits, l1_write_misses); + + double l2_read_hits = std::stod(hw_data[HW_L2_RH]); + double l2_read_misses = std::stod(hw_data[HW_L2_RM]); + double l2_write_hits = std::stod(hw_data[HW_L2_WH]); + double l2_write_misses = std::stod(hw_data[HW_L2_WM]); + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_RH])) + l2_read_hits = power_stats->get_l2_read_hits(1) - power_stats->l2r_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_RM])) + l2_read_misses = power_stats->get_l2_read_misses(1) - power_stats->l2r_misses_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_WH])) + l2_write_hits = power_stats->get_l2_write_hits(1) - power_stats->l2w_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_WM])) + l2_write_misses = power_stats->get_l2_write_misses(1) - power_stats->l2w_misses_kernel; + + wrapper->set_l2cache_power(l2_read_hits, l2_read_misses, l2_write_hits, l2_write_misses); + + float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; + float num_cores = shdr_config->num_shader(); + float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_NUM_SM_IDLE])) + wrapper->set_idle_core_power(num_idle_core); + else + wrapper->set_idle_core_power(std::stod(hw_data[HW_NUM_SM_IDLE])); + + float pipeline_duty_cycle = + ((*power_stats->m_average_pipeline_duty_cycle / (stat_sample_freq)) < + 0.8) + ? 
((*power_stats->m_average_pipeline_duty_cycle) / stat_sample_freq) + : 0.8; + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_PIPE_DUTY])) + wrapper->set_duty_cycle_power(pipeline_duty_cycle); + else + wrapper->set_duty_cycle_power(std::stod(hw_data[HW_PIPE_DUTY])); + + // Memory Controller + + double dram_reads = std::stod(hw_data[HW_DRAM_RD]); + double dram_writes = std::stod(hw_data[HW_DRAM_WR]); + double dram_pre = 0; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_reads = power_stats->get_dram_rd(1) - power_stats->dram_rd_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_WR])) + dram_writes = power_stats->get_dram_wr(1) - power_stats->dram_wr_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_pre = power_stats->get_dram_pre(1) - power_stats->dram_pre_kernel; + + + wrapper->set_mem_ctrl_power(dram_reads, dram_writes, dram_pre); + + if(aggregate_power_stats){ + power_stats->ialu_acc_execution += power_stats->get_ialu_accessess(1); + power_stats->imul24_acc_execution += power_stats->get_intmul24_accessess(1); + power_stats->imul32_acc_execution += power_stats->get_intmul32_accessess(1); + power_stats->imul_acc_execution += power_stats->get_intmul_accessess(1); + power_stats->idiv_acc_execution += power_stats->get_intdiv_accessess(1); + power_stats->dp_acc_execution += power_stats->get_dp_accessess(1); + power_stats->dpmul_acc_execution += power_stats->get_dpmul_accessess(1); + power_stats->dpdiv_acc_execution += power_stats->get_dpdiv_accessess(1); + power_stats->fp_acc_execution += power_stats->get_fp_accessess(1); + power_stats->fpmul_acc_execution += power_stats->get_fpmul_accessess(1); + power_stats->fpdiv_acc_execution += power_stats->get_fpdiv_accessess(1); + power_stats->sqrt_acc_execution += power_stats->get_sqrt_accessess(1); + power_stats->log_acc_execution += power_stats->get_log_accessess(1); + power_stats->sin_acc_execution += power_stats->get_sin_accessess(1); + power_stats->exp_acc_execution += power_stats->get_exp_accessess(1); + power_stats->tensor_acc_execution += power_stats->get_tensor_accessess(1); + power_stats->tex_acc_execution += power_stats->get_tex_accessess(1); + power_stats->tot_fpu_acc_execution += power_stats->get_tot_fpu_accessess(1); + power_stats->tot_sfu_acc_execution += power_stats->get_tot_sfu_accessess(1); + power_stats->tot_threads_acc_execution += power_stats->get_tot_threads_kernel(1); + power_stats->tot_warps_acc_execution += power_stats->get_tot_warps_kernel(1); + + power_stats->sp_active_lanes_execution += (power_stats->get_sp_active_lanes() * shdr_config->num_shader() * shdr_config->gpgpu_num_sp_units); + power_stats->sfu_active_lanes_execution += (power_stats->get_sfu_active_lanes() * shdr_config->num_shader() * shdr_config->gpgpu_num_sp_units); + + wrapper->set_int_accesses(power_stats->ialu_acc_execution, + power_stats->imul24_acc_execution, + power_stats->imul32_acc_execution, + power_stats->imul_acc_execution, + power_stats->idiv_acc_execution); + + wrapper->set_dp_accesses(power_stats->dp_acc_execution, + power_stats->dpmul_acc_execution, + power_stats->dpdiv_acc_execution); + + wrapper->set_fp_accesses(power_stats->fp_acc_execution, + power_stats->fpmul_acc_execution, + power_stats->fpdiv_acc_execution); + + wrapper->set_trans_accesses(power_stats->sqrt_acc_execution, + power_stats->log_acc_execution, + power_stats->sin_acc_execution, + power_stats->exp_acc_execution); + + 
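// Illustrative sketch of the per-statistic selection pattern repeated throughout
// calculate_hw_mcpat above: in hybrid mode (power_simulation_mode == 2) each
// counter comes from the performance model when its accelwattch_hybrid_configuration
// switch is set, otherwise from the hardware profile (hwpowerfile). pick_stat is a
// hypothetical helper, not an AccelWattch function; the real code inlines this
// if/else per counter.
#include <cstdio>

double pick_stat(int power_simulation_mode,        // 2 == hybrid mode
                 const bool *hybrid_cfg, int idx,  // per-stat hybrid switches
                 double sim_value,                 // simulator counter (per-kernel delta)
                 double hw_value) {                // value parsed from hwpowerfile
  if (power_simulation_mode == 2 && hybrid_cfg[idx])
    return sim_value;   // take the performance-model statistic
  return hw_value;      // otherwise trust the hardware profile
}

int main() {
  const bool hybrid_cfg[4] = {false, true, false, true};
  std::printf("%f\n", pick_stat(2, hybrid_cfg, 1, 1234.0, 1500.0));  // -> 1234
  std::printf("%f\n", pick_stat(2, hybrid_cfg, 2, 1234.0, 1500.0));  // -> 1500
}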
wrapper->set_tensor_accesses(power_stats->tensor_acc_execution); + + wrapper->set_tex_accesses(power_stats->tex_acc_execution); + + wrapper->set_exec_unit_power(power_stats->ialu_acc_execution, + power_stats->tot_fpu_acc_execution, + power_stats->tot_sfu_acc_execution); + + wrapper->set_avg_active_threads((double)((double)power_stats->tot_threads_acc_execution / (double)power_stats->tot_warps_acc_execution)); + + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->sp_active_lanes_execution) / shdr_config->num_shader() / shdr_config->gpgpu_num_sp_units / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->sfu_active_lanes_execution) / shdr_config->num_shader() / shdr_config->gpgpu_num_sp_units / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes >32.0 ) + avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } + else{ + wrapper->set_int_accesses(power_stats->get_ialu_accessess(1), + power_stats->get_intmul24_accessess(1), + power_stats->get_intmul32_accessess(1), + power_stats->get_intmul_accessess(1), + power_stats->get_intdiv_accessess(1)); + + wrapper->set_dp_accesses(power_stats->get_dp_accessess(1), + power_stats->get_dpmul_accessess(1), + power_stats->get_dpdiv_accessess(1)); + + wrapper->set_fp_accesses(power_stats->get_fp_accessess(1), + power_stats->get_fpmul_accessess(1), + power_stats->get_fpdiv_accessess(1)); + + wrapper->set_trans_accesses(power_stats->get_sqrt_accessess(1), + power_stats->get_log_accessess(1), + power_stats->get_sin_accessess(1), + power_stats->get_exp_accessess(1)); + + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(1)); + + wrapper->set_tex_accesses(power_stats->get_tex_accessess(1)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(1), + power_stats->get_ialu_accessess(1), + power_stats->get_tot_sfu_accessess(1)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(1)); + + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->get_sp_active_lanes()) / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes >32.0 ) + avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } + + + double n_icnt_simt_to_mem = + (double) + (power_stats->get_icnt_simt_to_mem(1) - power_stats->noc_tr_kernel); // # flits from SIMT clusters + // to memory partitions + double n_icnt_mem_to_simt = + (double) + (power_stats->get_icnt_mem_to_simt(1)- power_stats->noc_rc_kernel); // # flits from memory + // partitions to SIMT clusters + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_NOC])) + wrapper->set_NoC_power(n_icnt_mem_to_simt + n_icnt_simt_to_mem); // Number of flits traversing the interconnect from Accel-Sim + else + wrapper->set_NoC_power(std::stod(hw_data[HW_NOC])); // Number of flits traversing the interconnect from HW + + wrapper->compute(); + + wrapper->update_components_power(); + + wrapper->power_metrics_calculations(); + + wrapper->dump(); + power_stats->l1r_hits_kernel = power_stats->get_l1d_read_hits(1); + power_stats->l1r_misses_kernel = 
power_stats->get_l1d_read_misses(1); + power_stats->l1w_hits_kernel = power_stats->get_l1d_write_hits(1); + power_stats->l1w_misses_kernel = power_stats->get_l1d_write_misses(1); + power_stats->shared_accesses_kernel = power_stats->get_const_accessess(1); + power_stats->cc_accesses_kernel = power_stats->get_shmem_access(1); + power_stats->dram_rd_kernel = power_stats->get_dram_rd(1); + power_stats->dram_wr_kernel = power_stats->get_dram_wr(1); + power_stats->dram_pre_kernel = power_stats->get_dram_pre(1); + power_stats->l1i_hits_kernel = power_stats->get_inst_c_hits(1); + power_stats->l1i_misses_kernel = power_stats->get_inst_c_misses(1); + power_stats->l2r_hits_kernel = power_stats->get_l2_read_hits(1); + power_stats->l2r_misses_kernel = power_stats->get_l2_read_misses(1); + power_stats->l2w_hits_kernel = power_stats->get_l2_write_hits(1); + power_stats->l2w_misses_kernel = power_stats->get_l2_write_misses(1); + power_stats->noc_tr_kernel = power_stats->get_icnt_simt_to_mem(1); + power_stats->noc_rc_kernel = power_stats->get_icnt_mem_to_simt(1); + + + power_stats->clear(); +} \ No newline at end of file diff --git a/src/gpgpu-sim/power_interface.h b/src/gpgpu-sim/power_interface.h index 2bfd4d504..1a488948c 100644 --- a/src/gpgpu-sim/power_interface.h +++ b/src/gpgpu-sim/power_interface.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
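// Illustrative sketch of the per-kernel baseline pattern that closes
// calculate_hw_mcpat above: the index-1 getters return cumulative totals, so each
// kernel's contribution is "cumulative - baseline", and the baseline (e.g.
// l1r_hits_kernel) is refreshed after every kernel. KernelDeltaStub is a
// standalone stand-in, not a simulator class.
#include <cstdio>

struct KernelDeltaStub {
  double baseline = 0;    // e.g. power_stats->l1r_hits_kernel

  double end_of_kernel(double cumulative) {       // e.g. get_l1d_read_hits(1)
    double this_kernel = cumulative - baseline;   // portion charged to this kernel
    baseline = cumulative;                        // becomes the next kernel's baseline
    return this_kernel;
  }
};

int main() {
  KernelDeltaStub l1_read_hits;
  std::printf("%f\n", l1_read_hits.end_of_kernel(1000.0));  // kernel 1 -> 1000
  std::printf("%f\n", l1_read_hits.end_of_kernel(1800.0));  // kernel 2 -> 800
}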
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -43,7 +44,19 @@ void mcpat_cycle(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst); + unsigned inst, bool dvfs_enabled); + +void calculate_hw_mcpat(const gpgpu_sim_config &config, + const shader_core_config *shdr_config, + class gpgpu_sim_wrapper *wrapper, + class power_stat_t *power_stats, unsigned stat_sample_freq, + unsigned tot_cycle, unsigned cycle, unsigned tot_inst, + unsigned inst, int power_simulation_mode, bool dvfs_enabled, + char* hwpowerfile, char* benchname, std::string executed_kernelname, + const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats); + +bool parse_hw_file(char* hwpowerfile, bool find_target_kernel, vector &hw_data, char* benchname, std::string executed_kernelname); + void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper); #endif /* POWER_INTERFACE_H_ */ diff --git a/src/gpgpu-sim/power_stat.cc b/src/gpgpu-sim/power_stat.cc index 7b60ddf84..fd7a77560 100644 --- a/src/gpgpu-sim/power_stat.cc +++ b/src/gpgpu-sim/power_stat.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
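// Illustrative sketch of how parse_hw_file (declared above) walks the hwpowerfile
// CSV: one row per benchmark/kernel, fields split on commas and indexed by the
// HW_* enum (HW_BENCH_NAME, HW_KERNEL_NAME, HW_CYCLES, ...), whose exact order is
// defined elsewhere and not shown in this patch. The row below is invented purely
// for illustration.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::string line = "mybench,mykernel,1250000,1.0";  // hypothetical CSV row
  std::vector<std::string> hw_data;
  std::stringstream s(line);
  std::string word;
  while (std::getline(s, word, ','))  // same comma splitting as parse_hw_file
    hw_data.push_back(word);
  if (!hw_data.empty() && hw_data[0] == "mybench")   // cf. the HW_BENCH_NAME check
    std::cout << "matched benchmark, kernel: " << hw_data[1] << "\n";
}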
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,10 +55,64 @@ power_mem_stat_t::power_mem_stat_t(const memory_config *mem_config, init(); } +void power_stat_t::clear(){ + for(unsigned i=0; i< NUM_STAT_IDX; ++i){ + pwr_mem_stat->core_cache_stats[i].clear(); + pwr_mem_stat->l2_cache_stats[i].clear(); + for(unsigned j=0; jnum_shader(); ++j){ + pwr_core_stat->m_pipeline_duty_cycle[i][j]=0; + pwr_core_stat->m_num_decoded_insn[i][j]=0; + pwr_core_stat->m_num_FPdecoded_insn[i][j]=0; + pwr_core_stat->m_num_INTdecoded_insn[i][j]=0; + pwr_core_stat->m_num_storequeued_insn[i][j]=0; + pwr_core_stat->m_num_loadqueued_insn[i][j]=0; + pwr_core_stat->m_num_tex_inst[i][j]=0; + pwr_core_stat->m_num_ialu_acesses[i][j]=0; + pwr_core_stat->m_num_fp_acesses[i][j]=0; + pwr_core_stat->m_num_imul_acesses[i][j]=0; + pwr_core_stat->m_num_imul24_acesses[i][j]=0; + pwr_core_stat->m_num_imul32_acesses[i][j]=0; + pwr_core_stat->m_num_fpmul_acesses[i][j]=0; + pwr_core_stat->m_num_idiv_acesses[i][j]=0; + pwr_core_stat->m_num_fpdiv_acesses[i][j]=0; + pwr_core_stat->m_num_dp_acesses[i][j]=0; + pwr_core_stat->m_num_dpmul_acesses[i][j]=0; + pwr_core_stat->m_num_dpdiv_acesses[i][j]=0; + pwr_core_stat->m_num_tensor_core_acesses[i][j]=0; + pwr_core_stat->m_num_const_acesses[i][j]=0; + pwr_core_stat->m_num_tex_acesses[i][j]=0; + pwr_core_stat->m_num_sp_acesses[i][j]=0; + pwr_core_stat->m_num_sfu_acesses[i][j]=0; + pwr_core_stat->m_num_sqrt_acesses[i][j]=0; + pwr_core_stat->m_num_log_acesses[i][j]=0; + pwr_core_stat->m_num_sin_acesses[i][j]=0; + pwr_core_stat->m_num_exp_acesses[i][j]=0; + pwr_core_stat->m_num_mem_acesses[i][j]=0; + pwr_core_stat->m_num_sp_committed[i][j]=0; + pwr_core_stat->m_num_sfu_committed[i][j]=0; + pwr_core_stat->m_num_mem_committed[i][j]=0; + pwr_core_stat->m_read_regfile_acesses[i][j]=0; + pwr_core_stat->m_write_regfile_acesses[i][j]=0; + pwr_core_stat->m_non_rf_operands[i][j]=0; + pwr_core_stat->m_active_sp_lanes[i][j]=0; + pwr_core_stat->m_active_sfu_lanes[i][j]=0; + pwr_core_stat->m_active_exu_threads[i][j]=0; + pwr_core_stat->m_active_exu_warps[i][j]=0; + } + for (unsigned j = 0; j < m_mem_config->m_n_mem; ++j) { + pwr_mem_stat->n_rd[i][j]=0; + pwr_mem_stat->n_wr[i][j]=0; + pwr_mem_stat->n_pre[i][j]=0; + } + } +} + + + void power_mem_stat_t::init() { - shmem_read_access[CURRENT_STAT_IDX] = + shmem_access[CURRENT_STAT_IDX] = m_core_stats->gpgpu_n_shmem_bank_access; // Shared memory access - shmem_read_access[PREV_STAT_IDX] = + shmem_access[PREV_STAT_IDX] = (unsigned *)calloc(m_core_config->num_shader(), sizeof(unsigned)); for (unsigned i = 0; i < NUM_STAT_IDX; ++i) { @@ -71,6 +126,7 @@ void power_mem_stat_t::init() { n_pre[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_rd[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_wr[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); + n_wr_WB[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_req[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); // Interconnect stats @@ -86,8 +142,8 @@ void power_mem_stat_t::save_stats() { l2_cache_stats[PREV_STAT_IDX] = l2_cache_stats[CURRENT_STAT_IDX]; for (unsigned i = 0; i < m_core_config->num_shader(); ++i) { - shmem_read_access[PREV_STAT_IDX][i] = - shmem_read_access[CURRENT_STAT_IDX][i]; // Shared memory access + shmem_access[PREV_STAT_IDX][i] = + shmem_access[CURRENT_STAT_IDX][i]; // Shared memory access } for (unsigned i = 0; i < 
m_config->m_n_mem; ++i) { @@ -98,6 +154,7 @@ void power_mem_stat_t::save_stats() { n_pre[PREV_STAT_IDX][i] = n_pre[CURRENT_STAT_IDX][i]; n_rd[PREV_STAT_IDX][i] = n_rd[CURRENT_STAT_IDX][i]; n_wr[PREV_STAT_IDX][i] = n_wr[CURRENT_STAT_IDX][i]; + n_wr_WB[PREV_STAT_IDX][i] = n_wr_WB[CURRENT_STAT_IDX][i]; n_req[PREV_STAT_IDX][i] = n_req[CURRENT_STAT_IDX][i]; } @@ -117,7 +174,7 @@ void power_mem_stat_t::print(FILE *fout) const { unsigned total_mem_writes = 0; for (unsigned i = 0; i < m_config->m_n_mem; ++i) { total_mem_reads += n_rd[CURRENT_STAT_IDX][i]; - total_mem_writes += n_wr[CURRENT_STAT_IDX][i]; + total_mem_writes += n_wr[CURRENT_STAT_IDX][i] + n_wr_WB[CURRENT_STAT_IDX][i]; } fprintf(fout, "Total memory controller accesses: %u\n", total_mem_reads + total_mem_writes); @@ -147,198 +204,165 @@ void power_core_stat_t::print(FILE *fout) { // per core statistics fprintf(fout, "Power Metrics: \n"); for (unsigned i = 0; i < m_config->num_shader(); i++) { - fprintf(fout, "core %u:\n", i); - fprintf(fout, "\tpipeline duty cycle =%f\n", - m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal Deocded Instructions=%u\n", - m_num_decoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FP Deocded Instructions=%u\n", - m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal INT Deocded Instructions=%u\n", - m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal LOAD Queued Instructions=%u\n", - m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal STORE Queued Instructions=%u\n", - m_num_storequeued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IALU Acesses=%u\n", - m_num_ialu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FP Acesses=%u\n", - m_num_fp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL Acesses=%u\n", - m_num_imul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL24 Acesses=%u\n", - m_num_imul24_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL32 Acesses=%u\n", - m_num_imul32_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IDIV Acesses=%u\n", - m_num_idiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FPMUL Acesses=%u\n", - m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Acesses=%u\n", - m_num_trans_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FPDIV Acesses=%u\n", - m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Acesses=%u\n", - m_num_sfu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SP Acesses=%u\n", - m_num_sp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal MEM Acesses=%u\n", - m_num_mem_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Commissions=%u\n", - m_num_sfu_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SP Commissions=%u\n", - m_num_sp_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal MEM Commissions=%u\n", - m_num_mem_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal REG Reads=%u\n", - m_read_regfile_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal REG Writes=%u\n", - m_write_regfile_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal NON REG=%u\n", - m_non_rf_operands[CURRENT_STAT_IDX][i]); + fprintf(fout,"core %u:\n",i); + fprintf(fout,"\tpipeline duty cycle =%f\n",m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal Deocded Instructions=%u\n",m_num_decoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FP Deocded Instructions=%u\n",m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal INT 
Deocded Instructions=%u\n",m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal LOAD Queued Instructions=%u\n",m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal STORE Queued Instructions=%u\n",m_num_storequeued_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IALU Acesses=%f\n",m_num_ialu_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FP Acesses=%f\n",m_num_fp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DP Acesses=%f\n",m_num_dp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL Acesses=%f\n",m_num_imul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL24 Acesses=%f\n",m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL32 Acesses=%f\n",m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IDIV Acesses=%f\n",m_num_idiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FPMUL Acesses=%f\n",m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DPMUL Acesses=%f\n",m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SQRT Acesses=%f\n",m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal LOG Acesses=%f\n",m_num_log_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SIN Acesses=%f\n",m_num_sin_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal EXP Acesses=%f\n",m_num_exp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FPDIV Acesses=%f\n",m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DPDIV Acesses=%f\n",m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal TENSOR Acesses=%f\n",m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal CONST Acesses=%f\n",m_num_const_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal TEX Acesses=%f\n",m_num_tex_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SFU Acesses=%f\n",m_num_sfu_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SP Acesses=%f\n",m_num_sp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal MEM Acesses=%f\n",m_num_mem_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SFU Commissions=%u\n",m_num_sfu_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SP Commissions=%u\n",m_num_sp_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal MEM Commissions=%u\n",m_num_mem_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal REG Reads=%u\n",m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal REG Writes=%u\n",m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal NON REG=%u\n",m_non_rf_operands[CURRENT_STAT_IDX][i]); } } void power_core_stat_t::init() { - m_pipeline_duty_cycle[CURRENT_STAT_IDX] = m_core_stats->m_pipeline_duty_cycle; - m_num_decoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_decoded_insn; - m_num_FPdecoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_FPdecoded_insn; - m_num_INTdecoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_INTdecoded_insn; - m_num_storequeued_insn[CURRENT_STAT_IDX] = - m_core_stats->m_num_storequeued_insn; - m_num_loadqueued_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_loadqueued_insn; - m_num_ialu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_ialu_acesses; - m_num_fp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fp_acesses; - m_num_imul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul_acesses; - m_num_imul24_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul24_acesses; - m_num_imul32_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul32_acesses; - m_num_fpmul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpmul_acesses; 
- m_num_idiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_idiv_acesses; - m_num_fpdiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpdiv_acesses; - m_num_sp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_acesses; - m_num_sfu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_acesses; - m_num_trans_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_trans_acesses; - m_num_mem_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_acesses; - m_num_sp_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_committed; - m_num_sfu_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_committed; - m_num_mem_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_committed; - m_read_regfile_acesses[CURRENT_STAT_IDX] = - m_core_stats->m_read_regfile_acesses; - m_write_regfile_acesses[CURRENT_STAT_IDX] = - m_core_stats->m_write_regfile_acesses; - m_non_rf_operands[CURRENT_STAT_IDX] = m_core_stats->m_non_rf_operands; - m_active_sp_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sp_lanes; - m_active_sfu_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sfu_lanes; - m_num_tex_inst[CURRENT_STAT_IDX] = m_core_stats->m_num_tex_inst; + m_pipeline_duty_cycle[CURRENT_STAT_IDX]=m_core_stats->m_pipeline_duty_cycle; + m_num_decoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_decoded_insn; + m_num_FPdecoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_FPdecoded_insn; + m_num_INTdecoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_INTdecoded_insn; + m_num_storequeued_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_storequeued_insn; + m_num_loadqueued_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_loadqueued_insn; + m_num_ialu_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_ialu_acesses; + m_num_fp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fp_acesses; + m_num_imul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul_acesses; + m_num_imul24_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul24_acesses; + m_num_imul32_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul32_acesses; + m_num_fpmul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fpmul_acesses; + m_num_idiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_idiv_acesses; + m_num_fpdiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fpdiv_acesses; + m_num_dp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dp_acesses; + m_num_dpmul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dpmul_acesses; + m_num_dpdiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dpdiv_acesses; + m_num_sp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sp_acesses; + m_num_sfu_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sfu_acesses; + m_num_sqrt_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sqrt_acesses; + m_num_log_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_log_acesses; + m_num_sin_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sin_acesses; + m_num_exp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_exp_acesses; + m_num_tensor_core_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_tensor_core_acesses; + m_num_const_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_const_acesses; + m_num_tex_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_tex_acesses; + m_num_mem_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_mem_acesses; + m_num_sp_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_sp_committed; + m_num_sfu_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_sfu_committed; + m_num_mem_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_mem_committed; + m_read_regfile_acesses[CURRENT_STAT_IDX]=m_core_stats->m_read_regfile_acesses; + m_write_regfile_acesses[CURRENT_STAT_IDX]=m_core_stats->m_write_regfile_acesses; + 
m_non_rf_operands[CURRENT_STAT_IDX]=m_core_stats->m_non_rf_operands; + m_active_sp_lanes[CURRENT_STAT_IDX]=m_core_stats->m_active_sp_lanes; + m_active_sfu_lanes[CURRENT_STAT_IDX]=m_core_stats->m_active_sfu_lanes; + m_active_exu_threads[CURRENT_STAT_IDX]=m_core_stats->m_active_exu_threads; + m_active_exu_warps[CURRENT_STAT_IDX]=m_core_stats->m_active_exu_warps; + m_num_tex_inst[CURRENT_STAT_IDX]=m_core_stats->m_num_tex_inst; + + m_pipeline_duty_cycle[PREV_STAT_IDX]=(float*)calloc(m_config->num_shader(),sizeof(float)); + m_num_decoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_FPdecoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_INTdecoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_storequeued_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_loadqueued_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_tex_inst[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + + m_num_ialu_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_fp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul24_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul32_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_fpmul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_idiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_fpdiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dpmul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dpdiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_tensor_core_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_const_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_tex_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sfu_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sqrt_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_log_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sin_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_exp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_mem_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sp_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_sfu_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_mem_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_read_regfile_acesses[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_write_regfile_acesses[PREV_STAT_IDX]=(unsigned 
*)calloc(m_config->num_shader(),sizeof(unsigned)); + m_non_rf_operands[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_sp_lanes[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_sfu_lanes[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_exu_threads[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_active_exu_warps[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + - m_pipeline_duty_cycle[PREV_STAT_IDX] = - (float *)calloc(m_config->num_shader(), sizeof(float)); - m_num_decoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_FPdecoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_INTdecoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_storequeued_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_loadqueued_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_ialu_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fp_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_tex_inst[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul24_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul32_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fpmul_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_idiv_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fpdiv_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sp_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sfu_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_trans_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_mem_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sp_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sfu_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_mem_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_read_regfile_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_write_regfile_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_non_rf_operands[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_active_sp_lanes[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_active_sfu_lanes[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); } void power_core_stat_t::save_stats() { for (unsigned i = 0; i < m_config->num_shader(); ++i) { - m_pipeline_duty_cycle[PREV_STAT_IDX][i] = - m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]; - m_num_decoded_insn[PREV_STAT_IDX][i] = - 
m_num_decoded_insn[CURRENT_STAT_IDX][i]; - m_num_FPdecoded_insn[PREV_STAT_IDX][i] = - m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]; - m_num_INTdecoded_insn[PREV_STAT_IDX][i] = - m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]; - m_num_storequeued_insn[PREV_STAT_IDX][i] = - m_num_storequeued_insn[CURRENT_STAT_IDX][i]; - m_num_loadqueued_insn[PREV_STAT_IDX][i] = - m_num_loadqueued_insn[CURRENT_STAT_IDX][i]; - m_num_ialu_acesses[PREV_STAT_IDX][i] = - m_num_ialu_acesses[CURRENT_STAT_IDX][i]; - m_num_fp_acesses[PREV_STAT_IDX][i] = m_num_fp_acesses[CURRENT_STAT_IDX][i]; - m_num_tex_inst[PREV_STAT_IDX][i] = m_num_tex_inst[CURRENT_STAT_IDX][i]; - m_num_imul_acesses[PREV_STAT_IDX][i] = - m_num_imul_acesses[CURRENT_STAT_IDX][i]; - m_num_imul24_acesses[PREV_STAT_IDX][i] = - m_num_imul24_acesses[CURRENT_STAT_IDX][i]; - m_num_imul32_acesses[PREV_STAT_IDX][i] = - m_num_imul32_acesses[CURRENT_STAT_IDX][i]; - m_num_fpmul_acesses[PREV_STAT_IDX][i] = - m_num_fpmul_acesses[CURRENT_STAT_IDX][i]; - m_num_idiv_acesses[PREV_STAT_IDX][i] = - m_num_idiv_acesses[CURRENT_STAT_IDX][i]; - m_num_fpdiv_acesses[PREV_STAT_IDX][i] = - m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]; - m_num_sp_acesses[PREV_STAT_IDX][i] = m_num_sp_acesses[CURRENT_STAT_IDX][i]; - m_num_sfu_acesses[PREV_STAT_IDX][i] = - m_num_sfu_acesses[CURRENT_STAT_IDX][i]; - m_num_trans_acesses[PREV_STAT_IDX][i] = - m_num_trans_acesses[CURRENT_STAT_IDX][i]; - m_num_mem_acesses[PREV_STAT_IDX][i] = - m_num_mem_acesses[CURRENT_STAT_IDX][i]; - m_num_sp_committed[PREV_STAT_IDX][i] = - m_num_sp_committed[CURRENT_STAT_IDX][i]; - m_num_sfu_committed[PREV_STAT_IDX][i] = - m_num_sfu_committed[CURRENT_STAT_IDX][i]; - m_num_mem_committed[PREV_STAT_IDX][i] = - m_num_mem_committed[CURRENT_STAT_IDX][i]; - m_read_regfile_acesses[PREV_STAT_IDX][i] = - m_read_regfile_acesses[CURRENT_STAT_IDX][i]; - m_write_regfile_acesses[PREV_STAT_IDX][i] = - m_write_regfile_acesses[CURRENT_STAT_IDX][i]; - m_non_rf_operands[PREV_STAT_IDX][i] = - m_non_rf_operands[CURRENT_STAT_IDX][i]; - m_active_sp_lanes[PREV_STAT_IDX][i] = - m_active_sp_lanes[CURRENT_STAT_IDX][i]; - m_active_sfu_lanes[PREV_STAT_IDX][i] = - m_active_sfu_lanes[CURRENT_STAT_IDX][i]; + m_pipeline_duty_cycle[PREV_STAT_IDX][i]=m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]; + m_num_decoded_insn[PREV_STAT_IDX][i]= m_num_decoded_insn[CURRENT_STAT_IDX][i]; + m_num_FPdecoded_insn[PREV_STAT_IDX][i]=m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]; + m_num_INTdecoded_insn[PREV_STAT_IDX][i]=m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]; + m_num_storequeued_insn[PREV_STAT_IDX][i]=m_num_storequeued_insn[CURRENT_STAT_IDX][i]; + m_num_loadqueued_insn[PREV_STAT_IDX][i]=m_num_loadqueued_insn[CURRENT_STAT_IDX][i]; + m_num_ialu_acesses[PREV_STAT_IDX][i]=m_num_ialu_acesses[CURRENT_STAT_IDX][i]; + m_num_fp_acesses[PREV_STAT_IDX][i]=m_num_fp_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_inst[PREV_STAT_IDX][i]=m_num_tex_inst[CURRENT_STAT_IDX][i]; + m_num_imul_acesses[PREV_STAT_IDX][i]=m_num_imul_acesses[CURRENT_STAT_IDX][i]; + m_num_imul24_acesses[PREV_STAT_IDX][i]=m_num_imul24_acesses[CURRENT_STAT_IDX][i]; + m_num_imul32_acesses[PREV_STAT_IDX][i]=m_num_imul32_acesses[CURRENT_STAT_IDX][i]; + m_num_fpmul_acesses[PREV_STAT_IDX][i]=m_num_fpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_idiv_acesses[PREV_STAT_IDX][i]=m_num_idiv_acesses[CURRENT_STAT_IDX][i]; + m_num_fpdiv_acesses[PREV_STAT_IDX][i]=m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_sp_acesses[PREV_STAT_IDX][i]=m_num_sp_acesses[CURRENT_STAT_IDX][i]; + 
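// Illustrative sketch of the CURRENT/PREV two-slot pattern maintained by
// save_stats() above: the CURRENT slot follows the simulator's cumulative
// counters, save_stats() snapshots them into PREV at each sample, and a
// per-sample value can then be formed as CURRENT - PREV (presumably what the
// per-sample getters do; they are not part of this hunk). TwoSlotStub and its
// enum values are standalone stand-ins, not the simulator's definitions.
#include <cstdio>

enum { CURRENT_STAT_IDX = 0, PREV_STAT_IDX = 1, NUM_STAT_IDX = 2 };

struct TwoSlotStub {
  double acc[NUM_STAT_IDX] = {0.0, 0.0};

  void accumulate(double x) { acc[CURRENT_STAT_IDX] += x; }        // during simulation
  double sample_delta() const {                                     // per-sample value
    return acc[CURRENT_STAT_IDX] - acc[PREV_STAT_IDX];
  }
  void save_stats() { acc[PREV_STAT_IDX] = acc[CURRENT_STAT_IDX]; } // snapshot
};

int main() {
  TwoSlotStub ialu;
  ialu.accumulate(40.0);
  std::printf("%f\n", ialu.sample_delta());  // 40 this sample
  ialu.save_stats();
  ialu.accumulate(10.0);
  std::printf("%f\n", ialu.sample_delta());  // 10 this sample
}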
m_num_sfu_acesses[PREV_STAT_IDX][i]=m_num_sfu_acesses[CURRENT_STAT_IDX][i]; + m_num_sqrt_acesses[PREV_STAT_IDX][i]=m_num_sqrt_acesses[CURRENT_STAT_IDX][i]; + m_num_log_acesses[PREV_STAT_IDX][i]=m_num_log_acesses[CURRENT_STAT_IDX][i]; + m_num_sin_acesses[PREV_STAT_IDX][i]=m_num_sin_acesses[CURRENT_STAT_IDX][i]; + m_num_exp_acesses[PREV_STAT_IDX][i]=m_num_exp_acesses[CURRENT_STAT_IDX][i]; + m_num_dp_acesses[PREV_STAT_IDX][i]=m_num_dp_acesses[CURRENT_STAT_IDX][i]; + m_num_dpmul_acesses[PREV_STAT_IDX][i]=m_num_dpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_dpdiv_acesses[PREV_STAT_IDX][i]=m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_tensor_core_acesses[PREV_STAT_IDX][i]=m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]; + m_num_const_acesses[PREV_STAT_IDX][i]=m_num_const_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_acesses[PREV_STAT_IDX][i]=m_num_tex_acesses[CURRENT_STAT_IDX][i]; + m_num_mem_acesses[PREV_STAT_IDX][i]=m_num_mem_acesses[CURRENT_STAT_IDX][i]; + m_num_sp_committed[PREV_STAT_IDX][i]=m_num_sp_committed[CURRENT_STAT_IDX][i]; + m_num_sfu_committed[PREV_STAT_IDX][i]=m_num_sfu_committed[CURRENT_STAT_IDX][i]; + m_num_mem_committed[PREV_STAT_IDX][i]=m_num_mem_committed[CURRENT_STAT_IDX][i]; + m_read_regfile_acesses[PREV_STAT_IDX][i]=m_read_regfile_acesses[CURRENT_STAT_IDX][i]; + m_write_regfile_acesses[PREV_STAT_IDX][i]=m_write_regfile_acesses[CURRENT_STAT_IDX][i]; + m_non_rf_operands[PREV_STAT_IDX][i]=m_non_rf_operands[CURRENT_STAT_IDX][i]; + m_active_sp_lanes[PREV_STAT_IDX][i]=m_active_sp_lanes[CURRENT_STAT_IDX][i]; + m_active_sfu_lanes[PREV_STAT_IDX][i]=m_active_sfu_lanes[CURRENT_STAT_IDX][i]; + m_active_exu_threads[PREV_STAT_IDX][i]=m_active_exu_threads[CURRENT_STAT_IDX][i]; + m_active_exu_warps[PREV_STAT_IDX][i]=m_active_exu_warps[CURRENT_STAT_IDX][i]; } } @@ -356,6 +380,51 @@ power_stat_t::power_stat_t(const shader_core_config *shader_config, m_active_sms = active_sms; m_config = shader_config; m_mem_config = mem_config; + l1r_hits_kernel = 0; + l1r_misses_kernel = 0; + l1w_hits_kernel = 0; + l1w_misses_kernel = 0; + shared_accesses_kernel = 0; + cc_accesses_kernel = 0; + dram_rd_kernel = 0; + dram_wr_kernel = 0; + dram_pre_kernel = 0; + l1i_hits_kernel =0; + l1i_misses_kernel =0; + l2r_hits_kernel =0; + l2r_misses_kernel =0; + l2w_hits_kernel =0; + l2w_misses_kernel =0; + noc_tr_kernel = 0; + noc_rc_kernel = 0; + + tot_inst_execution = 0; + tot_int_inst_execution = 0; + tot_fp_inst_execution = 0; + commited_inst_execution = 0; + ialu_acc_execution = 0; + imul24_acc_execution = 0; + imul32_acc_execution = 0; + imul_acc_execution = 0; + idiv_acc_execution = 0; + dp_acc_execution = 0; + dpmul_acc_execution = 0; + dpdiv_acc_execution = 0; + fp_acc_execution = 0; + fpmul_acc_execution = 0; + fpdiv_acc_execution = 0; + sqrt_acc_execution = 0; + log_acc_execution = 0; + sin_acc_execution = 0; + exp_acc_execution = 0; + tensor_acc_execution = 0; + tex_acc_execution = 0; + tot_fpu_acc_execution = 0; + tot_sfu_acc_execution = 0; + tot_threads_acc_execution = 0; + tot_warps_acc_execution = 0; + sp_active_lanes_execution = 0; + sfu_active_lanes_execution = 0; } void power_stat_t::visualizer_print(gzFile visualizer_file) { diff --git a/src/gpgpu-sim/power_stat.h b/src/gpgpu-sim/power_stat.h index c469db3b3..e2c3ed5cc 100644 --- a/src/gpgpu-sim/power_stat.h +++ b/src/gpgpu-sim/power_stat.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. 
Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -51,29 +52,40 @@ struct shader_core_power_stats_pod { unsigned *m_num_INTdecoded_insn[NUM_STAT_IDX]; // number of instructions committed // by this shader core - unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; - unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; - unsigned *m_num_ialu_acesses[NUM_STAT_IDX]; - unsigned *m_num_fp_acesses[NUM_STAT_IDX]; - unsigned *m_num_tex_inst[NUM_STAT_IDX]; - unsigned *m_num_imul_acesses[NUM_STAT_IDX]; - unsigned *m_num_imul32_acesses[NUM_STAT_IDX]; - unsigned *m_num_imul24_acesses[NUM_STAT_IDX]; - unsigned *m_num_fpmul_acesses[NUM_STAT_IDX]; - unsigned *m_num_idiv_acesses[NUM_STAT_IDX]; - unsigned *m_num_fpdiv_acesses[NUM_STAT_IDX]; - unsigned *m_num_sp_acesses[NUM_STAT_IDX]; - unsigned *m_num_sfu_acesses[NUM_STAT_IDX]; - unsigned *m_num_trans_acesses[NUM_STAT_IDX]; - unsigned *m_num_mem_acesses[NUM_STAT_IDX]; - unsigned *m_num_sp_committed[NUM_STAT_IDX]; - unsigned *m_num_sfu_committed[NUM_STAT_IDX]; - unsigned *m_num_mem_committed[NUM_STAT_IDX]; - unsigned *m_active_sp_lanes[NUM_STAT_IDX]; - unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; - unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; - unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; - unsigned *m_non_rf_operands[NUM_STAT_IDX]; + unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; + unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; + unsigned *m_num_tex_inst[NUM_STAT_IDX]; + double *m_num_ialu_acesses[NUM_STAT_IDX]; + double *m_num_fp_acesses[NUM_STAT_IDX]; + double *m_num_imul_acesses[NUM_STAT_IDX]; + double *m_num_imul32_acesses[NUM_STAT_IDX]; + double *m_num_imul24_acesses[NUM_STAT_IDX]; + double *m_num_fpmul_acesses[NUM_STAT_IDX]; + double *m_num_idiv_acesses[NUM_STAT_IDX]; + double *m_num_fpdiv_acesses[NUM_STAT_IDX]; + double *m_num_dp_acesses[NUM_STAT_IDX]; + double *m_num_dpmul_acesses[NUM_STAT_IDX]; + double *m_num_dpdiv_acesses[NUM_STAT_IDX]; + double *m_num_sp_acesses[NUM_STAT_IDX]; + 
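// Each counter above is a pair of per-SM arrays indexed by CURRENT_STAT_IDX
// and PREV_STAT_IDX, and the switch from unsigned to double allows fractional,
// scaled access counts. The getters later in this header take an
// aggregate_stat flag: true sums the CURRENT copy (running totals), false sums
// CURRENT minus PREV (activity since the last save_stats() call). A minimal
// sketch of that idiom, with illustrative names only (not the simulator's
// actual API):
//
//   double sum_counter(double *stat[NUM_STAT_IDX], unsigned num_shaders,
//                      bool aggregate_stat) {
//     double total = 0;
//     for (unsigned i = 0; i < num_shaders; i++)
//       total += aggregate_stat ? stat[CURRENT_STAT_IDX][i]
//                               : stat[CURRENT_STAT_IDX][i] -
//                                     stat[PREV_STAT_IDX][i];
//     return total;
//   }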
double *m_num_sfu_acesses[NUM_STAT_IDX]; + double *m_num_sqrt_acesses[NUM_STAT_IDX]; + double *m_num_log_acesses[NUM_STAT_IDX]; + double *m_num_sin_acesses[NUM_STAT_IDX]; + double *m_num_exp_acesses[NUM_STAT_IDX]; + double *m_num_tensor_core_acesses[NUM_STAT_IDX]; + double *m_num_const_acesses[NUM_STAT_IDX]; + double *m_num_tex_acesses[NUM_STAT_IDX]; + double *m_num_mem_acesses[NUM_STAT_IDX]; + unsigned *m_num_sp_committed[NUM_STAT_IDX]; + unsigned *m_num_sfu_committed[NUM_STAT_IDX]; + unsigned *m_num_mem_committed[NUM_STAT_IDX]; + unsigned *m_active_sp_lanes[NUM_STAT_IDX]; + unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; + double *m_active_exu_threads[NUM_STAT_IDX]; + double *m_active_exu_warps[NUM_STAT_IDX]; + unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; + unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; + unsigned *m_non_rf_operands[NUM_STAT_IDX]; }; class power_core_stat_t : public shader_core_power_stats_pod { @@ -84,6 +96,7 @@ class power_core_stat_t : public shader_core_power_stats_pod { void print(FILE *fout); void init(); void save_stats(); + private: shader_core_stats *m_core_stats; @@ -96,8 +109,7 @@ struct mem_power_stats_pod { class cache_stats core_cache_stats[NUM_STAT_IDX]; // Total core stats class cache_stats l2_cache_stats[NUM_STAT_IDX]; // Total L2 partition stats - unsigned *shmem_read_access[NUM_STAT_IDX]; // Shared memory access - + unsigned *shmem_access[NUM_STAT_IDX]; // Shared memory access // Low level DRAM stats unsigned *n_cmd[NUM_STAT_IDX]; unsigned *n_activity[NUM_STAT_IDX]; @@ -106,6 +118,7 @@ struct mem_power_stats_pod { unsigned *n_pre[NUM_STAT_IDX]; unsigned *n_rd[NUM_STAT_IDX]; unsigned *n_wr[NUM_STAT_IDX]; + unsigned *n_wr_WB[NUM_STAT_IDX]; unsigned *n_req[NUM_STAT_IDX]; // Interconnect stats @@ -144,34 +157,88 @@ class power_stat_t { *m_average_pipeline_duty_cycle = 0; *m_active_sms = 0; } - - unsigned get_total_inst() { - unsigned total_inst = 0; + void clear(); + unsigned l1i_misses_kernel; + unsigned l1i_hits_kernel; + unsigned long long l1r_hits_kernel; + unsigned long long l1r_misses_kernel; + unsigned long long l1w_hits_kernel; + unsigned long long l1w_misses_kernel; + unsigned long long shared_accesses_kernel; + unsigned long long cc_accesses_kernel; + unsigned long long dram_rd_kernel; + unsigned long long dram_wr_kernel; + unsigned long long dram_pre_kernel; + unsigned long long l2r_hits_kernel; + unsigned long long l2r_misses_kernel; + unsigned long long l2w_hits_kernel; + unsigned long long l2w_misses_kernel; + unsigned long long noc_tr_kernel; + unsigned long long noc_rc_kernel; + unsigned long long tot_inst_execution; + unsigned long long tot_int_inst_execution; + unsigned long long tot_fp_inst_execution; + unsigned long long commited_inst_execution; + unsigned long long ialu_acc_execution; + unsigned long long imul24_acc_execution; + unsigned long long imul32_acc_execution; + unsigned long long imul_acc_execution; + unsigned long long idiv_acc_execution; + unsigned long long dp_acc_execution; + unsigned long long dpmul_acc_execution; + unsigned long long dpdiv_acc_execution; + unsigned long long fp_acc_execution; + unsigned long long fpmul_acc_execution; + unsigned long long fpdiv_acc_execution; + unsigned long long sqrt_acc_execution; + unsigned long long log_acc_execution; + unsigned long long sin_acc_execution; + unsigned long long exp_acc_execution; + unsigned long long tensor_acc_execution; + unsigned long long tex_acc_execution; + unsigned long long tot_fpu_acc_execution; + unsigned long long tot_sfu_acc_execution; + unsigned long 
long tot_threads_acc_execution; + unsigned long long tot_warps_acc_execution; + unsigned long long sp_active_lanes_execution; + unsigned long long sfu_active_lanes_execution; + double get_total_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_decoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_int_inst() { - unsigned total_inst = 0; + double get_total_int_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_INTdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_fp_inst() { - unsigned total_inst = 0; + double get_total_fp_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_FPdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_load_inst() { - unsigned total_inst = 0; + double get_total_load_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_loadqueued_insn[CURRENT_STAT_IDX][i]) - @@ -179,8 +246,8 @@ class power_stat_t { } return total_inst; } - unsigned get_total_store_inst() { - unsigned total_inst = 0; + double get_total_store_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_storequeued_insn[CURRENT_STAT_IDX][i]) - @@ -188,34 +255,39 @@ class power_stat_t { } return total_inst; } - unsigned get_sp_committed_inst() { - unsigned total_inst = 0; + double get_sp_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sp_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_sfu_committed_inst() { - unsigned total_inst = 0; + double get_sfu_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_mem_committed_inst() { - unsigned total_inst = 0; + double get_mem_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_committed_inst() { - unsigned total_inst = 0; + double get_committed_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - + 
if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]) + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]) + @@ -224,19 +296,27 @@ class power_stat_t { } return total_inst; } - unsigned get_regfile_reads() { - unsigned total_inst = 0; + double get_regfile_reads(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_read_regfile_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_regfile_writes() { - unsigned total_inst = 0; + double get_regfile_writes(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_write_regfile_acesses[PREV_STAT_IDX][i]); } @@ -253,17 +333,20 @@ class power_stat_t { return total_inst; } - unsigned get_non_regfile_operands() { - unsigned total_inst = 0; + double get_non_regfile_operands(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_non_rf_operands[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_sp_accessess() { - unsigned total_inst = 0; + double get_sp_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sp_acesses[PREV_STAT_IDX][i]); @@ -271,25 +354,58 @@ class power_stat_t { return total_inst; } - unsigned get_sfu_accessess() { - unsigned total_inst = 0; + double get_sfu_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sfu_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_trans_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_trans_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_trans_acesses[PREV_STAT_IDX][i]); - } - return total_inst; + + double get_sqrt_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_log_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + 
total_inst+=(pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_sin_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_exp_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; } - unsigned get_mem_accessess() { - unsigned total_inst = 0; + double get_mem_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_mem_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_acesses[PREV_STAT_IDX][i]); @@ -297,66 +413,164 @@ class power_stat_t { return total_inst; } - unsigned get_intdiv_accessess() { - unsigned total_inst = 0; + double get_intdiv_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_fpdiv_accessess() { - unsigned total_inst = 0; + double get_fpdiv_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul32_accessess() { - unsigned total_inst = 0; + double get_intmul32_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul24_accessess() { - unsigned total_inst = 0; + double get_intmul24_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < 
m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); + double get_intmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst+= (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+= (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_fpmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_fpmul_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); + double get_fp_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); } return total_inst; } - float get_sp_active_lanes() { - unsigned total_inst = 0; + double get_dp_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dpmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dpdiv_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_tensor_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_const_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += 
pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]; + else + total_inst += (pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_const_acesses[PREV_STAT_IDX][i]); + } + return (total_inst); + } + + double get_tex_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_sp_active_lanes() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_active_sp_lanes[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_active_sp_lanes[PREV_STAT_IDX][i]); @@ -365,7 +579,7 @@ class power_stat_t { } float get_sfu_active_lanes() { - unsigned total_inst = 0; + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_active_sfu_lanes[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_active_sfu_lanes[PREV_STAT_IDX][i]); @@ -375,49 +589,141 @@ m_config->gpgpu_num_sfu_units; } - unsigned get_tot_fpu_accessess() { - unsigned total_inst = 0; + + float get_active_threads(bool aggregate_stat) { + unsigned total_threads = 0; + unsigned total_warps = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if(aggregate_stat){ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) ; + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } + else{ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); + } + } + if(total_warps != 0) + return (float)((float)total_threads / (float)total_warps); + else + return 0; + } + + unsigned long long get_tot_threads_kernel(bool aggregate_stat) { + unsigned total_threads = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat){ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) ; + } + else{ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + } + } + + return total_threads; + } + unsigned long long get_tot_warps_kernel(bool aggregate_stat) { + unsigned long long total_warps = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if(aggregate_stat){ + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } + else{ + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); + } + } + return total_warps; + } + + + double get_tot_fpu_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + - 
(pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); } - total_inst += - get_total_load_inst() + get_total_store_inst() + get_tex_inst(); + //total_inst += get_total_load_inst()+get_total_store_inst()+get_tex_inst(); return total_inst; } - unsigned get_tot_sfu_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + + + double get_tot_sfu_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; i<m_config->num_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_trans_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_trans_acesses[PREV_STAT_IDX][i]); + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + + 
(pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); + } return total_inst; } - unsigned get_ialu_accessess() { - unsigned total_inst = 0; + double get_ialu_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_ialu_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_tex_inst() { - unsigned total_inst = 0; + double get_tex_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_tex_inst[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_tex_inst[PREV_STAT_IDX][i]); @@ -425,7 +731,7 @@ class power_stat_t { return total_inst; } - unsigned get_constant_c_accesses() { + double get_constant_c_accesses() { enum mem_access_type access_type[] = {CONST_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = @@ -440,7 +746,7 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_constant_c_misses() { + double get_constant_c_misses() { enum mem_access_type access_type[] = {CONST_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = @@ -455,10 +761,10 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_constant_c_hits() { + double get_constant_c_hits() { return (get_constant_c_accesses() - get_constant_c_misses()); } - unsigned get_texture_c_accesses() { + double get_texture_c_accesses() { enum mem_access_type access_type[] = {TEXTURE_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = @@ -473,7 +779,7 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_texture_c_misses() { + double get_texture_c_misses() { enum mem_access_type access_type[] = {TEXTURE_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = @@ -488,205 +794,268 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_texture_c_hits() { + double get_texture_c_hits() { return (get_texture_c_accesses() - get_texture_c_misses()); } - unsigned get_inst_c_accesses() { + double get_inst_c_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {INST_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat) + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + else + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, 
num_access_type, request_status, num_request_status)); } - unsigned get_inst_c_misses() { + double get_inst_c_misses(bool aggregate_stat) { enum mem_access_type access_type[] = {INST_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat) + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + else + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); } - unsigned get_inst_c_hits() { - return (get_inst_c_accesses() - get_inst_c_misses()); + double get_inst_c_hits(bool aggregate_stat) { + return (get_inst_c_accesses(aggregate_stat) - get_inst_c_misses(aggregate_stat)); } - unsigned get_l1d_read_accesses() { + double get_l1d_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } + } + double get_l1d_read_misses(bool aggregate_stat) { + return (get_l1d_read_accesses(aggregate_stat) - get_l1d_read_hits(aggregate_stat)); } - unsigned get_l1d_read_misses() { + double get_l1d_read_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, MSHR_HIT}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l1d_read_hits() { - return (get_l1d_read_accesses() - get_l1d_read_misses()); - } - unsigned get_l1d_write_accesses() { + double get_l1d_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; - enum cache_request_status request_status[] = {HIT, MISS, 
HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } + } + double get_l1d_write_misses(bool aggregate_stat) { + return (get_l1d_write_accesses(aggregate_stat) - get_l1d_write_hits(aggregate_stat)); } - unsigned get_l1d_write_misses() { + double get_l1d_write_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, MSHR_HIT}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l1d_write_hits() { - return (get_l1d_write_accesses() - get_l1d_write_misses()); - } - unsigned get_cache_misses() { - return get_l1d_read_misses() + get_constant_c_misses() + - get_l1d_write_misses() + get_texture_c_misses(); + double get_cache_misses() { + return get_l1d_read_misses(0) + get_constant_c_misses() + + get_l1d_write_misses(0) + get_texture_c_misses(); } - unsigned get_cache_read_misses() { - return get_l1d_read_misses() + get_constant_c_misses() + + double get_cache_read_misses() { + return get_l1d_read_misses(0) + get_constant_c_misses() + get_texture_c_misses(); } - unsigned get_cache_write_misses() { return get_l1d_write_misses(); } + double get_cache_write_misses() { return get_l1d_write_misses(0); } - unsigned get_shmem_read_access() { + double get_shmem_access(bool aggregate_stat) { unsigned total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_mem_stat->shmem_read_access[CURRENT_STAT_IDX][i]) - - (pwr_mem_stat->shmem_read_access[PREV_STAT_IDX][i]); + if(aggregate_stat) + total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]) - + (pwr_mem_stat->shmem_access[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_l2_read_accesses() { + unsigned long long get_l2_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, SECTOR_MISS}; unsigned num_access_type = 
sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_read_misses() { - enum mem_access_type access_type[] = { + unsigned long long get_l2_read_misses(bool aggregate_stat) { + return (get_l2_read_accesses(aggregate_stat) - get_l2_read_hits(aggregate_stat)); + } + + unsigned long long get_l2_read_hits(bool aggregate_stat) { + enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_read_hits() { - return (get_l2_read_accesses() - get_l2_read_misses()); - } - - unsigned get_l2_write_accesses() { + unsigned long long get_l2_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_write_misses() { - enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, + unsigned long long get_l2_write_misses(bool aggregate_stat) { + return (get_l2_write_accesses(aggregate_stat) - get_l2_write_hits(aggregate_stat)); + } + unsigned long long get_l2_write_hits(bool aggregate_stat) { + enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / 
sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_write_hits() { - return (get_l2_write_accesses() - get_l2_write_misses()); - } - unsigned get_dram_cmd() { + double get_dram_cmd() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_cmd[CURRENT_STAT_IDX][i] - @@ -694,7 +1063,7 @@ class power_stat_t { } return total; } - unsigned get_dram_activity() { + double get_dram_activity() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_activity[CURRENT_STAT_IDX][i] - @@ -702,7 +1071,7 @@ class power_stat_t { } return total; } - unsigned get_dram_nop() { + double get_dram_nop() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_nop[CURRENT_STAT_IDX][i] - @@ -710,7 +1079,7 @@ class power_stat_t { } return total; } - unsigned get_dram_act() { + double get_dram_act() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_act[CURRENT_STAT_IDX][i] - @@ -718,31 +1087,49 @@ class power_stat_t { } return total; } - unsigned get_dram_pre() { + double get_dram_pre(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_pre[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_rd() { + double get_dram_rd(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_rd[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_wr() { + double get_dram_wr(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_wr[PREV_STAT_IDX][i]); + if(aggregate_stat){ + total += pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] + + pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr[PREV_STAT_IDX][i]) + + (pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr_WB[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_req() { + double get_dram_req() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_req[CURRENT_STAT_IDX][i] - @@ -751,20 +1138,31 @@ class power_stat_t { return total; } - long get_icnt_simt_to_mem() { + unsigned long long get_icnt_simt_to_mem(bool aggregate_stat) { long total = 0; - for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - total += 
(pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - + for (unsigned i = 0; i < m_config->n_simt_clusters; ++i){ + if(aggregate_stat){ + total += pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_simt_to_mem[PREV_STAT_IDX][i]); + } } return total; } - long get_icnt_mem_to_simt() { + unsigned long long get_icnt_mem_to_simt(bool aggregate_stat) { long total = 0; for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]; + } + + else{ + total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_mem_to_simt[PREV_STAT_IDX][i]); + } } return total; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c6e7b8f67..c0161dd31 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// George L. Yuan, Andrew Turner, Inderpreet Singh -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// George L. Yuan, Andrew Turner, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -61,6 +62,20 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( m_core_id, m_cluster_id, m_memory_config, cycle); return mf; } + +mem_fetch *shader_core_mem_fetch_allocator::alloc( + new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, + m_memory_config->gpgpu_ctx); + mem_fetch *mf = new mem_fetch( + access, NULL, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, m_core_id, + m_cluster_id, m_memory_config, cycle, original_mf); + return mf; +} ///////////////////////////////////////////////////////////////////////////// std::list shader_core_ctx::get_regs_written(const inst_t &fvt) const { @@ -108,7 +123,7 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->sub_core_model) { // in subcore model, each scheduler should has its own issue register, so - // num scheduler = reg width + // ensure num scheduler = reg width assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_SP].get_size()); assert(m_config->gpgpu_num_sched_per_core == @@ -124,6 +139,11 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->gpgpu_num_int_units > 0) assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_INT].get_size()); + for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + if (m_config->m_specialized_unit[j].num_units > 0) + assert(m_config->gpgpu_num_sched_per_core == + m_config->m_specialized_unit[j].id_oc_spec_reg_width); + } } m_threadState = (thread_ctx_t *)calloc(sizeof(thread_ctx_t), @@ -172,6 +192,8 @@ void shader_core_ctx::create_schedulers() { ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO + : sched_config.find("rrr") != std::string::npos + ? CONCRETE_SCHEDULER_RRR : sched_config.find("old") != std::string::npos ? CONCRETE_SCHEDULER_OLDEST_FIRST : sched_config.find("warp_limiting") != @@ -206,6 +228,14 @@ void shader_core_ctx::create_schedulers() { &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, &m_pipeline_reg[ID_OC_MEM], i)); break; + case CONCRETE_SCHEDULER_RRR: + schedulers.push_back(new rrr_scheduler( + m_stats, this, m_scoreboard, m_simt_stack, &m_warp, + &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], + &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], + &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, + &m_pipeline_reg[ID_OC_MEM], i)); + break; case CONCRETE_SCHEDULER_OLDEST_FIRST: schedulers.push_back(new oldest_scheduler( m_stats, this, m_scoreboard, m_simt_stack, &m_warp, @@ -377,41 +407,41 @@ void shader_core_ctx::create_exec_pipeline() { // m_fu = new simd_function_unit*[m_num_function_units]; - for (int k = 0; k < m_config->gpgpu_num_sp_units; k++) { - m_fu.push_back(new sp_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_sp_units; k++) { + m_fu.push_back(new sp_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_SP); m_issue_port.push_back(OC_EX_SP); } - for (int k = 0; k < m_config->gpgpu_num_dp_units; k++) { - m_fu.push_back(new dp_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_dp_units; k++) { + m_fu.push_back(new dp_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_DP); m_issue_port.push_back(OC_EX_DP); } - for (int k = 0; k < m_config->gpgpu_num_int_units; k++) { - m_fu.push_back(new int_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_int_units; k++) { + m_fu.push_back(new int_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_INT); m_issue_port.push_back(OC_EX_INT); } - for (int k = 0; k < m_config->gpgpu_num_sfu_units; k++) { - m_fu.push_back(new sfu(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_sfu_units; k++) { + m_fu.push_back(new sfu(&m_pipeline_reg[EX_WB], 
m_config, this, k)); m_dispatch_port.push_back(ID_OC_SFU); m_issue_port.push_back(OC_EX_SFU); } - for (int k = 0; k < m_config->gpgpu_num_tensor_core_units; k++) { - m_fu.push_back(new tensor_core(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_tensor_core_units; k++) { + m_fu.push_back(new tensor_core(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_TENSOR_CORE); m_issue_port.push_back(OC_EX_TENSOR_CORE); } - for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { for (unsigned k = 0; k < m_config->m_specialized_unit[j].num_units; k++) { m_fu.push_back(new specialized_unit( &m_pipeline_reg[EX_WB], m_config, this, SPEC_UNIT_START_ID + j, m_config->m_specialized_unit[j].name, - m_config->m_specialized_unit[j].latency)); + m_config->m_specialized_unit[j].latency, k)); m_dispatch_port.push_back(m_config->m_specialized_unit[j].ID_OC_SPEC_ID); m_issue_port.push_back(m_config->m_specialized_unit[j].OC_EX_SPEC_ID); } @@ -456,6 +486,10 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, m_sid = shader_id; m_tpc = tpc_id; + if(get_gpu()->get_config().g_power_simulation_enabled){ + scaling_coeffs = get_gpu()->get_scaling_coeffs(); + } + m_last_inst_gpu_sim_cycle = 0; m_last_inst_gpu_tot_sim_cycle = 0; @@ -859,7 +893,7 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); if (pI1) { m_stats->m_num_decoded_insn[m_sid]++; - if (pI1->oprnd_type == INT_OP) { + if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI1->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -870,7 +904,7 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->ibuffer_fill(1, pI2); m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); m_stats->m_num_decoded_insn[m_sid]++; - if (pI2->oprnd_type == INT_OP) { + if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI2->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -953,8 +987,10 @@ void shader_core_ctx::fetch() { m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); std::list events; enum cache_request_status status; - if (m_config->perfect_inst_const_cache) + if (m_config->perfect_inst_const_cache){ status = HIT; + shader_cache_access_log(m_sid, INSTRUCTION, 0); + } else status = m_L1I->access( (new_addr_type)ppc, mf, @@ -1082,6 +1118,33 @@ void scheduler_unit::order_lrr( } } +template +void scheduler_unit::order_rrr( + std::vector &result_list, const typename std::vector &input_list, + const typename std::vector::const_iterator &last_issued_from_input, + unsigned num_warps_to_add) { + result_list.clear(); + + if (m_num_issued_last_cycle > 0 || warp(m_current_turn_warp).done_exit() || + warp(m_current_turn_warp).waiting()) { + std::vector::const_iterator iter = + (last_issued_from_input == input_list.end()) ? 
+ input_list.begin() : last_issued_from_input + 1; + for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { + if (iter == input_list.end()) { + iter = input_list.begin(); + } + unsigned warp_id = (*iter)->get_warp_id(); + if (!(*iter)->done_exit() && !(*iter)->waiting()) { + result_list.push_back(*iter); + m_current_turn_warp = warp_id; + break; + } + } + } else { + result_list.push_back(&warp(m_current_turn_warp)); + } +} /** * A general function to order things in an priority-based way. * The core usage of the function is similar to order_lrr. @@ -1228,29 +1291,21 @@ void scheduler_unit::cycle() { previous_issued_inst_exec_type = exec_unit_type_t::MEM; } } else { - bool sp_pipe_avail = - (m_shader->m_config->gpgpu_num_sp_units > 0) && - m_sp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool sfu_pipe_avail = - (m_shader->m_config->gpgpu_num_sfu_units > 0) && - m_sfu_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool tensor_core_pipe_avail = - (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && - m_tensor_core_out->has_free( - m_shader->m_config->sub_core_model, m_id); - bool dp_pipe_avail = - (m_shader->m_config->gpgpu_num_dp_units > 0) && - m_dp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool int_pipe_avail = - (m_shader->m_config->gpgpu_num_int_units > 0) && - m_int_out->has_free(m_shader->m_config->sub_core_model, m_id); - // This code need to be refactored if (pI->op != TENSOR_CORE_OP && pI->op != SFU_OP && pI->op != DP_OP && !(pI->op >= SPEC_UNIT_START_ID)) { bool execute_on_SP = false; bool execute_on_INT = false; + bool sp_pipe_avail = + (m_shader->m_config->gpgpu_num_sp_units > 0) && + m_sp_out->has_free(m_shader->m_config->sub_core_model, + m_id); + bool int_pipe_avail = + (m_shader->m_config->gpgpu_num_int_units > 0) && + m_int_out->has_free(m_shader->m_config->sub_core_model, + m_id); + // if INT unit pipline exist, then execute ALU and INT // operations on INT unit and SP-FPU on SP unit (like in Volta) // if INT unit pipline does not exist, then execute all ALU, INT @@ -1311,6 +1366,11 @@ void scheduler_unit::cycle() { (pI->op == DP_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::DP)) { + bool dp_pipe_avail = + (m_shader->m_config->gpgpu_num_dp_units > 0) && + m_dp_out->has_free(m_shader->m_config->sub_core_model, + m_id); + if (dp_pipe_avail) { m_shader->issue_warp(*m_dp_out, pI, active_mask, warp_id, m_id); @@ -1326,6 +1386,11 @@ void scheduler_unit::cycle() { (pI->op == SFU_OP) || (pI->op == ALU_SFU_OP)) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::SFU)) { + bool sfu_pipe_avail = + (m_shader->m_config->gpgpu_num_sfu_units > 0) && + m_sfu_out->has_free(m_shader->m_config->sub_core_model, + m_id); + if (sfu_pipe_avail) { m_shader->issue_warp(*m_sfu_out, pI, active_mask, warp_id, m_id); @@ -1337,6 +1402,11 @@ void scheduler_unit::cycle() { } else if ((pI->op == TENSOR_CORE_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::TENSOR)) { + bool tensor_core_pipe_avail = + (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && + m_tensor_core_out->has_free( + m_shader->m_config->sub_core_model, m_id); + if (tensor_core_pipe_avail) { m_shader->issue_warp(*m_tensor_core_out, pI, active_mask, warp_id, m_id); @@ -1407,7 +1477,7 @@ void scheduler_unit::cycle() { m_last_supervised_issued = supervised_iter; } } - + m_num_issued_last_cycle = issued; if (issued == 1) m_stats->single_issue_nums[m_id]++; else if (issued > 1) @@ -1456,6 
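// A minimal sketch of the "rrr" rotation policy added in order_rrr above,
// written against a hypothetical Warp struct rather than shd_warp_t; it only
// illustrates when the turn advances, not the full result_list bookkeeping.
#include <vector>

struct Warp {
  unsigned id = 0;
  bool done = false;     // corresponds to done_exit()
  bool waiting = false;  // corresponds to waiting()
};

// Returns the warp whose turn it is this cycle. The turn is held while the
// current warp failed to issue last cycle but is still schedulable; it rotates
// after a successful issue, or once the current warp exits or starts waiting.
unsigned rrr_pick(const std::vector<Warp> &warps, unsigned current_turn,
                  unsigned issued_last_cycle) {
  if (warps.empty()) return current_turn;
  const Warp &cur = warps[current_turn];
  if (issued_last_cycle == 0 && !cur.done && !cur.waiting)
    return current_turn;  // retry the same warp
  for (unsigned step = 1; step <= warps.size(); ++step) {
    unsigned cand = (current_turn + step) % warps.size();
    if (!warps[cand].done && !warps[cand].waiting)
      return cand;  // hand the turn to the next schedulable warp
  }
  return current_turn;  // nothing else is ready; keep the turn unchanged
}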
+1526,10 @@ void lrr_scheduler::order_warps() { order_lrr(m_next_cycle_prioritized_warps, m_supervised_warps, m_last_supervised_issued, m_supervised_warps.size()); } +void rrr_scheduler::order_warps() { + order_rrr(m_next_cycle_prioritized_warps, m_supervised_warps, + m_last_supervised_issued, m_supervised_warps.size()); +} void gto_scheduler::order_warps() { order_by_priority(m_next_cycle_prioritized_warps, m_supervised_warps, @@ -1569,7 +1643,10 @@ void swl_scheduler::order_warps() { } } -void shader_core_ctx::read_operands() {} +void shader_core_ctx::read_operands() { + for (int i = 0; i < m_config->reg_file_port_throughput; ++i) + m_operand_collector.step(); +} address_type coalesced_segment(address_type addr, unsigned segment_size_lg2bytes) { @@ -1669,8 +1746,15 @@ void shader_core_ctx::execute() { m_fu[n]->active_lanes_in_pipeline(); unsigned issue_port = m_issue_port[n]; register_set &issue_inst = m_pipeline_reg[issue_port]; - warp_inst_t **ready_reg = issue_inst.get_ready(); - if (issue_inst.has_ready() && m_fu[n]->can_issue(**ready_reg)) { + unsigned reg_id; + bool partition_issue = + m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); + if (partition_issue) { + reg_id = m_fu[n]->get_issue_reg_id(); + } + warp_inst_t **ready_reg = issue_inst.get_ready(partition_issue, reg_id); + if (issue_inst.has_ready(partition_issue, reg_id) && + m_fu[n]->can_issue(**ready_reg)) { bool schedule_wb_now = !m_fu[n]->stallable(); int resbus = -1; if (schedule_wb_now && @@ -1970,6 +2054,21 @@ void ldst_unit::L1_latency_queue_cycle() { } else { assert(status == MISS || status == HIT_RESERVED); l1_latency_queue[j][0] = NULL; + if (m_config->m_L1D_config.get_write_policy() != WRITE_THROUGH && + mf_next->get_inst().is_store() && + (m_config->m_L1D_config.get_write_allocate_policy() == + FETCH_ON_WRITE || + m_config->m_L1D_config.get_write_allocate_policy() == + LAZY_FETCH_ON_READ) && + !was_writeallocate_sent(events)) { + unsigned dec_ack = + (m_config->m_L1D_config.get_mshr_type() == SECTOR_ASSOC) + ? 
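// Sketch of the partitioned-issue lookup used in execute() above: under the
// sub_core model each function unit owns one slot of the ID_OC register set
// (its issue_reg_id), so readiness checks are restricted to that slot. The
// Slot/IssueRegisterSet names are illustrative stand-ins, not the simulator's
// register_set type.
#include <cstddef>
#include <vector>

struct Slot {
  bool valid = false;  // a decoded warp instruction is parked here
};

struct IssueRegisterSet {
  std::vector<Slot> regs;  // one slot per scheduler in the sub_core model

  // With partitioning, only reg_id may be consumed; otherwise any slot will do.
  int get_ready(bool partitioned, unsigned reg_id) const {
    if (partitioned) return regs[reg_id].valid ? static_cast<int>(reg_id) : -1;
    for (std::size_t i = 0; i < regs.size(); ++i)
      if (regs[i].valid) return static_cast<int>(i);
    return -1;  // nothing ready
  }
};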
(mf_next->get_data_size() / SECTOR_SIZE) + : 1; + mf_next->set_reply(); + for (unsigned i = 0; i < dec_ack; ++i) m_core->store_ack(mf_next); + if (!write_sent && !read_sent) delete mf_next; + } } } @@ -2112,22 +2211,32 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { m_dispatch_reg = new warp_inst_t(config); } +void simd_function_unit::issue(register_set &source_reg) { + bool partition_issue = + m_config->sub_core_model && this->is_issue_partitioned(); + source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), + m_dispatch_reg); + occupied.set(m_dispatch_reg->latency); +} + sfu::sfu(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core, + issue_reg_id) { m_name = "SFU"; } tensor_core::tensor_core(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) + shader_core_ctx *core, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, config->max_tensor_core_latency, - core) { + core, issue_reg_id) { m_name = "TENSOR_CORE"; } void sfu::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SFU__OP; @@ -2136,7 +2245,8 @@ void sfu::issue(register_set &source_reg) { } void tensor_core::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = TENSOR_CORE__OP; @@ -2172,7 +2282,7 @@ void sp_unit::active_lanes_in_pipeline() { void dp_unit::active_lanes_in_pipeline() { unsigned active_count = pipelined_simd_unit::get_active_lanes_in_pipeline(); assert(active_count <= m_core->get_config()->warp_size); - m_core->incspactivelanes_stat(active_count); + //m_core->incspactivelanes_stat(active_count); m_core->incfuactivelanes_stat(active_count); m_core->incfumemactivelanes_stat(active_count); } @@ -2208,34 +2318,39 @@ void tensor_core::active_lanes_in_pipeline() { } sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_sp_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_sp_latency, core, + issue_reg_id) { m_name = "SP "; } specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency) - : pipelined_simd_unit(result_port, config, latency, core) { + char *unit_name, unsigned latency, + unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { m_name = unit_name; m_supported_op = supported_op; } dp_unit::dp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_dp_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_dp_latency, core, + issue_reg_id) { m_name = "DP "; } int_unit::int_unit(register_set *result_port, const shader_core_config *config, - 
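// Sketch of the store-acknowledgement count (dec_ack) computed in
// L1_latency_queue_cycle above: with a sectored MSHR a write-allocating store
// owes one ack per sector it touches, otherwise a single ack suffices.
// SECTOR_BYTES = 32 restates the simulator's SECTOR_SIZE as an assumption.
constexpr unsigned SECTOR_BYTES = 32;

unsigned store_acks_to_send(bool sector_assoc_mshr, unsigned data_size_bytes) {
  // Integer division mirrors the dec_ack expression above.
  return sector_assoc_mshr ? data_size_bytes / SECTOR_BYTES : 1;
}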
shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_int_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_int_latency, core, + issue_reg_id) { m_name = "INT "; } void sp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2243,7 +2358,8 @@ void sp_unit ::issue(register_set &source_reg) { } void dp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = DP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2251,7 +2367,8 @@ void dp_unit ::issue(register_set &source_reg) { } void specialized_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SPECIALIZED__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2259,7 +2376,8 @@ void specialized_unit ::issue(register_set &source_reg) { } void int_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = INTP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2269,7 +2387,8 @@ void int_unit ::issue(register_set &source_reg) { pipelined_simd_unit::pipelined_simd_unit(register_set *result_port, const shader_core_config *config, unsigned max_latency, - shader_core_ctx *core) + shader_core_ctx *core, + unsigned issue_reg_id) : simd_function_unit(config) { m_result_port = result_port; m_pipeline_depth = max_latency; @@ -2277,6 +2396,7 @@ pipelined_simd_unit::pipelined_simd_unit(register_set *result_port, for (unsigned i = 0; i < m_pipeline_depth; i++) m_pipeline_reg[i] = new warp_inst_t(config); m_core = core; + m_issue_reg_id = issue_reg_id; active_insts_in_pipeline = 0; } @@ -2303,7 +2423,10 @@ void pipelined_simd_unit::cycle() { void pipelined_simd_unit::issue(register_set &source_reg) { // move_warp(m_dispatch_reg,source_reg); - warp_inst_t **ready_reg = source_reg.get_ready(); + bool partition_issue = + m_config->sub_core_model && this->is_issue_partitioned(); + warp_inst_t **ready_reg = + source_reg.get_ready(partition_issue, m_issue_reg_id); m_core->incexecstat((*ready_reg)); // source_reg.move_out_to(m_dispatch_reg); simd_function_unit::issue(source_reg); @@ -2360,7 +2483,7 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, unsigned sid, unsigned tpc) - : pipelined_simd_unit(NULL, config, config->smem_latency, core), + : pipelined_simd_unit(NULL, config, config->smem_latency, core, 0), m_next_wb(config) { assert(config->smem_latency > 1); init(icnt, mf_allocator, core, operand_collector, scoreboard, config, @@ -2388,7 +2511,7 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, 
Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, unsigned sid, unsigned tpc, l1_cache *new_l1d_cache) - : pipelined_simd_unit(NULL, config, 3, core), + : pipelined_simd_unit(NULL, config, 3, core, 0), m_L1D(new_l1d_cache), m_next_wb(config) { init(icnt, mf_allocator, core, operand_collector, scoreboard, config, @@ -2550,8 +2673,7 @@ inst->space.get_type() != shared_space) { unsigned warp_id = inst->warp_id(); */ void ldst_unit::cycle() { writeback(); - for (int i = 0; i < m_config->reg_file_port_throughput; ++i) - m_operand_collector->step(); + for (unsigned stage = 0; (stage + 1) < m_pipeline_depth; stage++) if (m_pipeline_reg[stage]->empty() && !m_pipeline_reg[stage + 1]->empty()) move_warp(m_pipeline_reg[stage], m_pipeline_reg[stage + 1]); @@ -2964,52 +3086,69 @@ void warp_inst_t::print(FILE *fout) const { m_config->gpgpu_ctx->func_sim->ptx_print_insn(pc, fout); fprintf(fout, "\n"); } -void shader_core_ctx::incexecstat(warp_inst_t *&inst) { - if (inst->mem_op == TEX) inctex_stat(inst->active_count(), 1); - - // Latency numbers for next operations are used to scale the power values - // for special operations, according observations from microbenchmarking - // TODO: put these numbers in the xml configuration - - switch (inst->sp_op) { +void shader_core_ctx::incexecstat(warp_inst_t *&inst) +{ + // Latency numbers for next operations are used to scale the power values + // for special operations, according observations from microbenchmarking + // TODO: put these numbers in the xml configuration + if(get_gpu()->get_config().g_power_simulation_enabled){ + switch(inst->sp_op){ case INT__OP: - incialu_stat(inst->active_count(), 32); + incialu_stat(inst->active_count(), scaling_coeffs->int_coeff); break; case INT_MUL_OP: - incimul_stat(inst->active_count(), 7.2); + incimul_stat(inst->active_count(), scaling_coeffs->int_mul_coeff); break; case INT_MUL24_OP: - incimul24_stat(inst->active_count(), 4.2); + incimul24_stat(inst->active_count(), scaling_coeffs->int_mul24_coeff); break; case INT_MUL32_OP: - incimul32_stat(inst->active_count(), 4); + incimul32_stat(inst->active_count(), scaling_coeffs->int_mul32_coeff); break; case INT_DIV_OP: - incidiv_stat(inst->active_count(), 40); + incidiv_stat(inst->active_count(), scaling_coeffs->int_div_coeff); break; case FP__OP: - incfpalu_stat(inst->active_count(), 1); + incfpalu_stat(inst->active_count(),scaling_coeffs->fp_coeff); break; case FP_MUL_OP: - incfpmul_stat(inst->active_count(), 1.8); + incfpmul_stat(inst->active_count(), scaling_coeffs->fp_mul_coeff); break; case FP_DIV_OP: - incfpdiv_stat(inst->active_count(), 48); + incfpdiv_stat(inst->active_count(), scaling_coeffs->fp_div_coeff); + break; + case DP___OP: + incdpalu_stat(inst->active_count(), scaling_coeffs->dp_coeff); + break; + case DP_MUL_OP: + incdpmul_stat(inst->active_count(), scaling_coeffs->dp_mul_coeff); + break; + case DP_DIV_OP: + incdpdiv_stat(inst->active_count(), scaling_coeffs->dp_div_coeff); break; case FP_SQRT_OP: - inctrans_stat(inst->active_count(), 25); + incsqrt_stat(inst->active_count(), scaling_coeffs->sqrt_coeff); break; case FP_LG_OP: - inctrans_stat(inst->active_count(), 35); + inclog_stat(inst->active_count(), scaling_coeffs->log_coeff); break; case FP_SIN_OP: - inctrans_stat(inst->active_count(), 12); + incsin_stat(inst->active_count(), scaling_coeffs->sin_coeff); break; case FP_EXP_OP: - inctrans_stat(inst->active_count(), 35); + incexp_stat(inst->active_count(), scaling_coeffs->exp_coeff); + 
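// Sketch of the coefficient-scaled accounting that replaces the hard-coded
// latencies in incexecstat above: each special-op class is weighted by a
// coefficient supplied by the power model (AccelWattch) rather than a fixed
// constant. The OpClass enum and Coeffs struct are illustrative stand-ins for
// sp_op and PowerscalingCoefficients.
struct Coeffs {
  double int_alu = 1.0, int_mul = 1.0, fp_alu = 1.0, fp_mul = 1.0,
         fp_div = 1.0, dp_alu = 1.0, sqrt_op = 1.0, tensor = 1.0;
};

enum class OpClass { IntAlu, IntMul, FpAlu, FpMul, FpDiv, DpAlu, Sqrt, Tensor };

// Weighted accesses contributed by one dynamic instruction.
double weighted_accesses(OpClass op, unsigned active_lanes, const Coeffs &c) {
  double coeff = 1.0;
  switch (op) {
    case OpClass::IntAlu: coeff = c.int_alu; break;
    case OpClass::IntMul: coeff = c.int_mul; break;
    case OpClass::FpAlu:  coeff = c.fp_alu;  break;
    case OpClass::FpMul:  coeff = c.fp_mul;  break;
    case OpClass::FpDiv:  coeff = c.fp_div;  break;
    case OpClass::DpAlu:  coeff = c.dp_alu;  break;
    case OpClass::Sqrt:   coeff = c.sqrt_op; break;
    case OpClass::Tensor: coeff = c.tensor;  break;
  }
  return static_cast<double>(active_lanes) * coeff;
}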
break; + case TENSOR__OP: + inctensor_stat(inst->active_count(), scaling_coeffs->tensor_coeff); + break; + case TEX__OP: + inctex_stat(inst->active_count(), scaling_coeffs->tex_coeff); break; default: break; + } + if(inst->const_cache_operand) //warp has const address space load as one operand + inc_const_accesses(1); } } void shader_core_ctx::print_stage(unsigned int stage, FILE *fout) const { @@ -3264,49 +3403,46 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { if (adaptive_cache_config && !k.cache_config_set) { // For more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - unsigned total_shmed = kernel_info->smem * result; - assert(total_shmed >= 0 && total_shmed <= gpgpu_shmem_size); - // assert(gpgpu_shmem_size == 98304); //Volta has 96 KB shared - // assert(m_L1D_config.get_nset() == 4); //Volta L1 has four sets - if (total_shmed < gpgpu_shmem_size) { - switch (adaptive_cache_config) { - case FIXED: - break; - case ADAPTIVE_VOLTA: { - // For Volta, we assign the remaining shared memory to L1 cache - // For more info about adaptive cache, see - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - // assert(gpgpu_shmem_size == 98304); //Volta has 96 KB shared - - // To Do: make it flexible and not tuned to 9KB share memory - unsigned max_assoc = m_L1D_config.get_max_assoc(); - if (total_shmed == 0) - m_L1D_config.set_assoc(max_assoc); // L1 is 128KB and shd=0 - else if (total_shmed > 0 && total_shmed <= 8192) - m_L1D_config.set_assoc(0.9375 * - max_assoc); // L1 is 120KB and shd=8KB - else if (total_shmed > 8192 && total_shmed <= 16384) - m_L1D_config.set_assoc(0.875 * - max_assoc); // L1 is 112KB and shd=16KB - else if (total_shmed > 16384 && total_shmed <= 32768) - m_L1D_config.set_assoc(0.75 * max_assoc); // L1 is 96KB and - // shd=32KB - else if (total_shmed > 32768 && total_shmed <= 65536) - m_L1D_config.set_assoc(0.5 * max_assoc); // L1 is 64KB and shd=64KB - else if (total_shmed > 65536 && total_shmed <= gpgpu_shmem_size) - m_L1D_config.set_assoc(0.25 * max_assoc); // L1 is 32KB and - // shd=96KB - else - assert(0); - break; - } - default: - assert(0); + unsigned total_shmem = kernel_info->smem * result; + assert(total_shmem >= 0 && total_shmem <= shmem_opt_list.back()); + + // Unified cache config is in KB. Converting to B + unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; + + bool l1d_configured = false; + unsigned max_assoc = m_L1D_config.get_max_assoc(); + + for (std::vector::const_iterator it = shmem_opt_list.begin(); + it < shmem_opt_list.end(); it++) { + if (total_shmem <= *it) { + float l1_ratio = 1 - ((float)*(it) / total_unified); + // make sure the ratio is between 0 and 1 + assert(0 <= l1_ratio && l1_ratio <= 1); + // round to nearest instead of round down + m_L1D_config.set_assoc(max_assoc * l1_ratio + 0.5f); + l1d_configured = true; + break; } + } - printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", - m_L1D_config.get_total_size_inKB()); + assert(l1d_configured && "no shared memory option found"); + + if (m_L1D_config.is_streaming()) { + // for streaming cache, if the whole memory is allocated + // to the L1 cache, then make the allocation to be on_MISS + // otherwise, make it ON_FILL to eliminate line allocation fails + // i.e. 
MSHR throughput is the same, independent on the L1 cache + // size/associativity + if (total_shmem == 0) { + m_L1D_config.set_allocation_policy(ON_MISS); + printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); + } else { + m_L1D_config.set_allocation_policy(ON_FILL); + printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); + } } + printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", + m_L1D_config.get_total_size_inKB()); k.cache_config_set = true; } @@ -3867,15 +4003,26 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { assert((m_bank_warp_shift == 5) || (m_warp_size != 32)); sub_core_model = shader->get_config()->sub_core_model; - m_num_warp_sceds = shader->get_config()->gpgpu_num_sched_per_core; - if (sub_core_model) + m_num_warp_scheds = shader->get_config()->gpgpu_num_sched_per_core; + unsigned reg_id; + if (sub_core_model) { assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); + assert(m_num_warp_scheds <= m_cu.size() && + m_cu.size() % m_num_warp_scheds == 0); + } m_num_banks_per_sched = num_banks / shader->get_config()->gpgpu_num_sched_per_core; for (unsigned j = 0; j < m_cu.size(); j++) { + if (sub_core_model) { + unsigned cusPerSched = m_cu.size() / m_num_warp_scheds; + reg_id = j / cusPerSched; + } m_cu[j]->init(j, num_banks, m_bank_warp_shift, shader->get_config(), this, - sub_core_model, m_num_banks_per_sched); + sub_core_model, reg_id, m_num_banks_per_sched); + } + for (unsigned j = 0; j < m_dispatch_units.size(); j++) { + m_dispatch_units[j].init(sub_core_model,m_num_warp_scheds); } m_initialized = true; } @@ -3974,7 +4121,22 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { for (unsigned j = 0; j < inp.m_cu_sets.size(); j++) { std::vector &cu_set = m_cus[inp.m_cu_sets[j]]; bool allocated = false; - for (unsigned k = 0; k < cu_set.size(); k++) { + unsigned cuLowerBound = 0; + unsigned cuUpperBound = cu_set.size(); + unsigned schd_id; + if (sub_core_model) { + // Sub core model only allocates on the subset of CUs assigned to the + // scheduler that issued + unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); + schd_id = (*inp.m_in[i]).get_schd_id(reg_id); + assert(cu_set.size() % m_num_warp_scheds == 0 && + cu_set.size() >= m_num_warp_scheds); + unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; + cuLowerBound = schd_id * cusPerSched; + cuUpperBound = cuLowerBound + cusPerSched; + assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); + } + for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); @@ -3984,8 +4146,9 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } if (allocated) break; // cu has been allocated, no need to search more. } - break; // can only service a single input, if it failed it will fail for - // others. + // break; // can only service a single input, if it failed it will fail + // for + // others. 
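// Worked sketch of the adaptive shared-memory / L1 carve-out in max_cta()
// above, assuming a 128 KB unified L1/shared array, max_assoc = 8 and a
// Volta-style option list {0, 8, 16, 32, 64, 96} KB; these numbers are
// illustrative and come from the configuration, not from the code itself.
#include <algorithm>
#include <vector>

unsigned adaptive_l1_assoc(unsigned shmem_bytes_needed,
                           std::vector<unsigned> shmem_options_bytes,
                           unsigned unified_bytes, unsigned max_assoc) {
  std::sort(shmem_options_bytes.begin(), shmem_options_bytes.end());
  for (unsigned opt : shmem_options_bytes) {
    if (shmem_bytes_needed <= opt) {
      double l1_ratio = 1.0 - static_cast<double>(opt) / unified_bytes;
      // round to nearest, exactly as the +0.5f passed to set_assoc() above
      return static_cast<unsigned>(max_assoc * l1_ratio + 0.5);
    }
  }
  return 0;  // no option fits; the simulator asserts in this case
}
// Example: 20 KB of shared memory selects the 32 KB option, leaving
// (1 - 32/128) * 8 = 6 ways of L1, i.e. a 96 KB data cache.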
} } } @@ -4032,7 +4195,8 @@ void opndcoll_rfu_t::allocate_reads() { } bool opndcoll_rfu_t::collector_unit_t::ready() const { - return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(); + return (!m_free) && m_not_ready.none() && + (*m_output_register).has_free(m_sub_core_model, m_reg_id); } void opndcoll_rfu_t::collector_unit_t::dump( @@ -4050,12 +4214,10 @@ void opndcoll_rfu_t::collector_unit_t::dump( } } -void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, - unsigned log2_warp_size, - const core_config *config, - opndcoll_rfu_t *rfu, - bool sub_core_model, - unsigned banks_per_sched) { +void opndcoll_rfu_t::collector_unit_t::init( + unsigned n, unsigned num_banks, unsigned log2_warp_size, + const core_config *config, opndcoll_rfu_t *rfu, bool sub_core_model, + unsigned reg_id, unsigned banks_per_sched) { m_rfu = rfu; m_cuid = n; m_num_banks = num_banks; @@ -4063,6 +4225,7 @@ void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, m_warp = new warp_inst_t(config); m_bank_warp_shift = log2_warp_size; m_sub_core_model = sub_core_model; + m_reg_id = reg_id; m_num_banks_per_sched = banks_per_sched; } @@ -4097,8 +4260,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); - // move_warp(*m_output_register,m_warp); - m_output_register->move_in(m_warp); + m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; m_output_register = NULL; for (unsigned i = 0; i < MAX_REG_OPERANDS * 2; i++) m_src_op[i].reset(); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 6481790bc..65d56251c 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, -// Ali Bakhoda -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, +// Ali Bakhoda, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -238,7 +239,10 @@ class shd_warp_t { unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; } unsigned get_warp_id() const { return m_warp_id; } - class shader_core_ctx * get_shader() { return m_shader; } + class shader_core_ctx *get_shader() { + return m_shader; + } + private: static const unsigned IBUFFER_SIZE = 2; class shader_core_ctx *m_shader; @@ -318,6 +322,7 @@ enum concrete_scheduler { CONCRETE_SCHEDULER_LRR = 0, CONCRETE_SCHEDULER_GTO, CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE, + CONCRETE_SCHEDULER_RRR, CONCRETE_SCHEDULER_WARP_LIMITING, CONCRETE_SCHEDULER_OLDEST_FIRST, NUM_CONCRETE_SCHEDULERS @@ -369,6 +374,12 @@ class scheduler_unit { // this can be copied freely, so can be used in std const typename std::vector &input_list, const typename std::vector::const_iterator &last_issued_from_input, unsigned num_warps_to_add); + template + void order_rrr( + typename std::vector &result_list, + const typename std::vector &input_list, + const typename std::vector::const_iterator &last_issued_from_input, + unsigned num_warps_to_add); enum OrderingType { // The item that issued last is prioritized first then the sorted result @@ -427,6 +438,8 @@ class scheduler_unit { // this can be copied freely, so can be used in std register_set *m_tensor_core_out; register_set *m_mem_out; std::vector &m_spec_cores_out; + unsigned m_num_issued_last_cycle; + unsigned m_current_turn_warp; int m_id; }; @@ -450,6 +463,25 @@ class lrr_scheduler : public scheduler_unit { } }; +class rrr_scheduler : public scheduler_unit { + public: + rrr_scheduler(shader_core_stats *stats, shader_core_ctx *shader, + Scoreboard *scoreboard, simt_stack **simt, + std::vector *warp, register_set *sp_out, + register_set *dp_out, register_set *sfu_out, + register_set *int_out, register_set *tensor_core_out, + std::vector &spec_cores_out, + register_set *mem_out, int id) + : scheduler_unit(stats, shader, scoreboard, simt, warp, sp_out, dp_out, + sfu_out, int_out, tensor_core_out, spec_cores_out, + mem_out, id) {} + virtual ~rrr_scheduler() {} + virtual void order_warps(); + virtual void done_adding_supervised_warps() { + m_last_supervised_issued = m_supervised_warps.end(); + } +}; + class gto_scheduler : public scheduler_unit { public: gto_scheduler(shader_core_stats *stats, shader_core_ctx *shader, @@ -878,11 +910,13 @@ class opndcoll_rfu_t { // operand collector based register file unit } unsigned get_sp_op() const { return m_warp->sp_op; } unsigned get_id() const { return m_cuid; } // returns CU hw id + unsigned get_reg_id() const { return m_reg_id; } // modifiers void init(unsigned n, unsigned num_banks, unsigned log2_warp_size, const core_config *config, opndcoll_rfu_t *rfu, - bool m_sub_core_model, unsigned num_banks_per_sched); + bool m_sub_core_model, unsigned reg_id, + unsigned num_banks_per_sched); bool allocate(register_set *pipeline_reg, register_set *output_reg); void collect_operand(unsigned op) { m_not_ready.reset(op); } @@ -906,6 +940,7 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_banks_per_sched; bool m_sub_core_model; + unsigned m_reg_id; // if sub_core_model enabled, limit regs this cu can r/w }; class dispatch_unit_t { @@ -916,13 +951,44 @@ class opndcoll_rfu_t { // operand collector based register file unit m_num_collectors = (*cus).size(); m_next_cu = 0; } + void init(bool sub_core_model, unsigned 
num_warp_scheds) { + m_sub_core_model = sub_core_model; + m_num_warp_scheds = num_warp_scheds; + if (m_sub_core_model) { + m_last_cu_set = new unsigned(m_num_warp_scheds); + for (unsigned i = 0; i < m_num_warp_scheds; i++) + { + m_last_cu_set[i] = i * m_num_collectors / m_num_warp_scheds; + } + } + + } collector_unit_t *find_ready() { - for (unsigned n = 0; n < m_num_collectors; n++) { - unsigned c = (m_last_cu + n + 1) % m_num_collectors; - if ((*m_collector_units)[c].ready()) { - m_last_cu = c; - return &((*m_collector_units)[c]); + if (m_sub_core_model) { + assert(m_num_collectors % m_num_warp_scheds == 0 && + m_num_collectors >= m_num_warp_scheds); + unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; + for (unsigned i = 0; i < m_num_warp_scheds; i++) { + unsigned cuLowerBound = i * cusPerSched; + unsigned cuUpperBound = cuLowerBound + cusPerSched; + assert(0 <= cuLowerBound && cuUpperBound <= m_num_collectors); + assert(cuLowerBound <= m_last_cu_set[i] && m_last_cu_set[i] <= cuUpperBound); + for (unsigned j = cuLowerBound; j < cuUpperBound; j++) { + unsigned c = cuLowerBound + (m_last_cu_set[i] + j + 1) % cusPerSched; + if ((*m_collector_units)[c].ready()) { + m_last_cu_set[i] = c; + return &((*m_collector_units)[c]); + } + } + } + } else { + for (unsigned n = 0; n < m_num_collectors; n++) { + unsigned c = (m_last_cu + n + 1) % m_num_collectors; + if ((*m_collector_units)[c].ready()) { + m_last_cu = c; + return &((*m_collector_units)[c]); + } } } return NULL; @@ -932,7 +998,11 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_collectors; std::vector *m_collector_units; unsigned m_last_cu; // dispatch ready cu's rr + unsigned *m_last_cu_set; unsigned m_next_cu; // for initialization + + bool m_sub_core_model; + unsigned m_num_warp_scheds; }; // opndcoll_rfu_t data members @@ -947,7 +1017,7 @@ class opndcoll_rfu_t { // operand collector based register file unit arbiter_t m_arbiter; unsigned m_num_banks_per_sched; - unsigned m_num_warp_sceds; + unsigned m_num_warp_scheds; bool sub_core_model; // unsigned m_num_ports; @@ -1039,10 +1109,7 @@ class simd_function_unit { ~simd_function_unit() { delete m_dispatch_reg; } // modifiers - virtual void issue(register_set &source_reg) { - source_reg.move_out_to(m_dispatch_reg); - occupied.set(m_dispatch_reg->latency); - } + virtual void issue(register_set &source_reg); virtual void cycle() = 0; virtual void active_lanes_in_pipeline() = 0; @@ -1051,6 +1118,8 @@ class simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return m_dispatch_reg->empty() && !occupied.test(inst.latency); } + virtual bool is_issue_partitioned() = 0; + virtual unsigned get_issue_reg_id() = 0; virtual bool stallable() const = 0; virtual void print(FILE *fp) const { fprintf(fp, "%s dispatch= ", m_name.c_str()); @@ -1070,7 +1139,7 @@ class pipelined_simd_unit : public simd_function_unit { public: pipelined_simd_unit(register_set *result_port, const shader_core_config *config, unsigned max_latency, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); // modifiers virtual void cycle(); @@ -1091,6 +1160,8 @@ class pipelined_simd_unit : public simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return simd_function_unit::can_issue(inst); } + virtual bool is_issue_partitioned() = 0; + unsigned get_issue_reg_id() { return m_issue_reg_id; } virtual void print(FILE *fp) const { simd_function_unit::print(fp); for (int s = m_pipeline_depth - 1; s >= 0; s--) { @@ -1106,6 
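// Sketch of the per-scheduler dispatch rotation in find_ready() above:
// collector units are divided into equal contiguous slices, one per warp
// scheduler, and each slice keeps its own round-robin pointer. std::vector is
// used instead of the raw m_last_cu_set array purely to keep the sketch
// self-contained; names are illustrative.
#include <vector>

struct CollectorUnit {
  bool ready = false;
};

class SubCoreDispatcher {
 public:
  SubCoreDispatcher(unsigned num_cus, unsigned num_scheds)
      : m_cus(num_cus), m_last(num_scheds) {
    // scheduler i owns CUs [i*per, (i+1)*per)
    unsigned per = num_cus / num_scheds;
    for (unsigned i = 0; i < num_scheds; ++i) m_last[i] = i * per;
  }

  // Returns the index of a ready collector unit, searching each scheduler's
  // slice round-robin starting after the last unit it dispatched, or -1.
  int find_ready() {
    unsigned per = m_cus.size() / m_last.size();
    for (unsigned s = 0; s < m_last.size(); ++s) {
      unsigned lo = s * per;
      for (unsigned j = 0; j < per; ++j) {
        unsigned c = lo + (m_last[s] + j + 1) % per;
        if (m_cus[c].ready) {
          m_last[s] = c;
          return static_cast<int>(c);
        }
      }
    }
    return -1;
  }

  std::vector<CollectorUnit> m_cus;
  std::vector<unsigned> m_last;  // last dispatched CU per scheduler slice
};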
+1177,8 @@ class pipelined_simd_unit : public simd_function_unit { warp_inst_t **m_pipeline_reg; register_set *m_result_port; class shader_core_ctx *m_core; + unsigned m_issue_reg_id; // if sub_core_model is enabled we can only issue + // from a subset of operand collectors unsigned active_insts_in_pipeline; }; @@ -1113,7 +1186,7 @@ class pipelined_simd_unit : public simd_function_unit { class sfu : public pipelined_simd_unit { public: sfu(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1129,12 +1202,13 @@ class sfu : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class dp_unit : public pipelined_simd_unit { public: dp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case DP_OP: @@ -1146,12 +1220,13 @@ class dp_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class tensor_core : public pipelined_simd_unit { public: tensor_core(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case TENSOR_CORE_OP: @@ -1163,12 +1238,13 @@ class tensor_core : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class int_unit : public pipelined_simd_unit { public: int_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1194,12 +1270,13 @@ class int_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class sp_unit : public pipelined_simd_unit { public: sp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1223,13 +1300,14 @@ class sp_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency); + char *unit_name, unsigned latency, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1238,6 +1316,7 @@ class specialized_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } private: unsigned m_supported_op; @@ -1259,6 +1338,7 @@ class ldst_unit : public 
pipelined_simd_unit { // modifiers virtual void issue(register_set &inst); + bool is_issue_partitioned() { return false; } virtual void cycle(); void fill(mem_fetch *mf); @@ -1479,6 +1559,17 @@ class shader_core_config : public core_config { } else break; // we only accept continuous specialized_units, i.e., 1,2,3,4 } + + // parse gpgpu_shmem_option for adpative cache config + if (adaptive_cache_config) { + std::stringstream ss(gpgpu_shmem_option); + while (ss.good()) { + std::string option; + std::getline(ss, option, ','); + shmem_opt_list.push_back((unsigned)std::stoi(option) * 1024); + } + std::sort(shmem_opt_list.begin(), shmem_opt_list.end()); + } } void reg_options(class OptionParser *opp); unsigned max_cta(const kernel_info_t &k) const; @@ -1619,18 +1710,26 @@ struct shader_core_stats_pod { unsigned *m_num_INTdecoded_insn; unsigned *m_num_storequeued_insn; unsigned *m_num_loadqueued_insn; - unsigned *m_num_ialu_acesses; - unsigned *m_num_fp_acesses; - unsigned *m_num_imul_acesses; unsigned *m_num_tex_inst; - unsigned *m_num_fpmul_acesses; - unsigned *m_num_idiv_acesses; - unsigned *m_num_fpdiv_acesses; - unsigned *m_num_sp_acesses; - unsigned *m_num_sfu_acesses; - unsigned *m_num_tensor_core_acesses; - unsigned *m_num_trans_acesses; - unsigned *m_num_mem_acesses; + double *m_num_ialu_acesses; + double *m_num_fp_acesses; + double *m_num_imul_acesses; + double *m_num_fpmul_acesses; + double *m_num_idiv_acesses; + double *m_num_fpdiv_acesses; + double *m_num_sp_acesses; + double *m_num_sfu_acesses; + double *m_num_tensor_core_acesses; + double *m_num_tex_acesses; + double *m_num_const_acesses; + double *m_num_dp_acesses; + double *m_num_dpmul_acesses; + double *m_num_dpdiv_acesses; + double *m_num_sqrt_acesses; + double *m_num_log_acesses; + double *m_num_sin_acesses; + double *m_num_exp_acesses; + double *m_num_mem_acesses; unsigned *m_num_sp_committed; unsigned *m_num_tlb_hits; unsigned *m_num_tlb_accesses; @@ -1640,13 +1739,15 @@ struct shader_core_stats_pod { unsigned *m_read_regfile_acesses; unsigned *m_write_regfile_acesses; unsigned *m_non_rf_operands; - unsigned *m_num_imul24_acesses; - unsigned *m_num_imul32_acesses; + double *m_num_imul24_acesses; + double *m_num_imul32_acesses; unsigned *m_active_sp_lanes; unsigned *m_active_sfu_lanes; unsigned *m_active_tensor_core_lanes; unsigned *m_active_fu_lanes; unsigned *m_active_fu_mem_lanes; + double *m_active_exu_threads; //For power model + double *m_active_exu_warps; //For power model unsigned *m_n_diverge; // number of divergence occurring in this shader unsigned gpgpu_n_load_insn; unsigned gpgpu_n_store_insn; @@ -1717,38 +1818,56 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_loadqueued_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_tex_inst = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_INTdecoded_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_ialu_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fp_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tex_inst = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul24_acesses = - (unsigned 
*)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul32_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpmul_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_idiv_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpdiv_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_dp_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_dpmul_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_dpdiv_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); m_num_sp_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_sfu_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tensor_core_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_trans_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tensor_core_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_const_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tex_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sqrt_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_log_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_sin_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_exp_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); m_num_mem_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_sp_committed = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tlb_hits = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_tlb_hits = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_tlb_accesses = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_sp_lanes = @@ -1759,6 +1878,10 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_fu_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_active_exu_threads = + (double *)calloc(config->num_shader(), sizeof(double)); + m_active_exu_warps = + (double *)calloc(config->num_shader(), sizeof(double)); m_active_fu_mem_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_sfu_committed = @@ -1773,7 +1896,8 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_non_rf_operands = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_n_diverge = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_n_diverge = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); shader_cycle_distro = (unsigned *)calloc(config->warp_size + 3, sizeof(unsigned)); last_shader_cycle_distro = @@ -1802,6 +1926,48 @@ class shader_core_stats : public shader_core_stats_pod { delete m_incoming_traffic_stats; free(m_num_sim_insn); free(m_num_sim_winsn); + 
free(m_num_FPdecoded_insn); + free(m_num_INTdecoded_insn); + free(m_num_storequeued_insn); + free(m_num_loadqueued_insn); + free(m_num_ialu_acesses); + free(m_num_fp_acesses); + free(m_num_imul_acesses); + free(m_num_tex_inst); + free(m_num_fpmul_acesses); + free(m_num_idiv_acesses); + free(m_num_fpdiv_acesses); + free(m_num_sp_acesses); + free(m_num_sfu_acesses); + free(m_num_tensor_core_acesses); + free(m_num_tex_acesses); + free(m_num_const_acesses); + free(m_num_dp_acesses); + free(m_num_dpmul_acesses); + free(m_num_dpdiv_acesses); + free(m_num_sqrt_acesses); + free(m_num_log_acesses); + free(m_num_sin_acesses); + free(m_num_exp_acesses); + free(m_num_mem_acesses); + free(m_num_sp_committed); + free(m_num_tlb_hits); + free(m_num_tlb_accesses); + free(m_num_sfu_committed); + free(m_num_tensor_core_committed); + free(m_num_mem_committed); + free(m_read_regfile_acesses); + free(m_write_regfile_acesses); + free(m_non_rf_operands); + free(m_num_imul24_acesses); + free(m_num_imul32_acesses); + free(m_active_sp_lanes); + free(m_active_sfu_lanes); + free(m_active_tensor_core_lanes); + free(m_active_fu_lanes); + free(m_active_exu_threads); + free(m_active_exu_warps); + free(m_active_fu_mem_lanes); free(m_n_diverge); free(shader_cycle_distro); free(last_shader_cycle_distro); @@ -1856,6 +2022,12 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { } mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle) const; + mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, + bool wr, unsigned long long cycle, unsigned wid, + unsigned sid, unsigned tpc, mem_fetch *original_mf) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { warp_inst_t inst_copy = inst; @@ -1900,7 +2072,7 @@ class shader_core_ctx : public core_t { printf("GPGPU-Sim uArch: Shader %d bind to kernel %u \'%s\'\n", m_sid, m_kernel->get_uid(), m_kernel->name().c_str()); } - + PowerscalingCoefficients *scaling_coeffs; // accessors bool fetch_unit_response_buffer_full() const; bool ldst_unit_response_buffer_full() const; @@ -1958,119 +2130,206 @@ class shader_core_ctx : public core_t { void incload_stat() { m_stats->m_num_loadqueued_insn[m_sid]++; } void incstore_stat() { m_stats->m_num_storequeued_insn[m_sid]++; } - void incialu_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_ialu_acesses[m_sid] = - m_stats->m_num_ialu_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_ialu_acesses[m_sid] = - m_stats->m_num_ialu_acesses[m_sid] + active_count * latency; + void incialu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_ialu_acesses[m_sid]=m_stats->m_num_ialu_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_ialu_acesses[m_sid]=m_stats->m_num_ialu_acesses[m_sid]+(double)active_count*latency; } - } - void inctex_stat(unsigned active_count, double latency) { - m_stats->m_num_tex_inst[m_sid] = - m_stats->m_num_tex_inst[m_sid] + active_count * latency; - } - void incimul_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul_acesses[m_sid] 
= - m_stats->m_num_imul_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_imul_acesses[m_sid] = - m_stats->m_num_imul_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul_acesses[m_sid]=m_stats->m_num_imul_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_imul_acesses[m_sid]=m_stats->m_num_imul_acesses[m_sid]+(double)active_count*latency; } - } - void incimul24_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul24_acesses[m_sid] = - m_stats->m_num_imul24_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_imul24_acesses[m_sid] = - m_stats->m_num_imul24_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul24_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul24_acesses[m_sid]=m_stats->m_num_imul24_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_imul24_acesses[m_sid]=m_stats->m_num_imul24_acesses[m_sid]+(double)active_count*latency; } - } - void incimul32_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul32_acesses[m_sid] = - m_stats->m_num_imul32_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_imul32_acesses[m_sid] = - m_stats->m_num_imul32_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul32_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul32_acesses[m_sid]=m_stats->m_num_imul32_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_imul32_acesses[m_sid]=m_stats->m_num_imul32_acesses[m_sid]+(double)active_count*latency; } - // printf("Int_Mul -- Active_count: %d\n",active_count); - } - void incidiv_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_idiv_acesses[m_sid] = - m_stats->m_num_idiv_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_idiv_acesses[m_sid] = - m_stats->m_num_idiv_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incidiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_idiv_acesses[m_sid]=m_stats->m_num_idiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_idiv_acesses[m_sid]=m_stats->m_num_idiv_acesses[m_sid]+(double)active_count*latency; } - } - void incfpalu_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - 
m_stats->m_num_fp_acesses[m_sid] = - m_stats->m_num_fp_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_fp_acesses[m_sid] = - m_stats->m_num_fp_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpalu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_fp_acesses[m_sid]=m_stats->m_num_fp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_fp_acesses[m_sid]=m_stats->m_num_fp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpmul_stat(unsigned active_count,double latency) { + // printf("FP MUL stat increament\n"); + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_fpmul_acesses[m_sid]=m_stats->m_num_fpmul_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_fpmul_acesses[m_sid]=m_stats->m_num_fpmul_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpdiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_fpdiv_acesses[m_sid]=m_stats->m_num_fpdiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_fpdiv_acesses[m_sid]=m_stats->m_num_fpdiv_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpalu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_dp_acesses[m_sid]=m_stats->m_num_dp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_dp_acesses[m_sid]=m_stats->m_num_dp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpmul_stat(unsigned active_count,double latency) { + // printf("FP MUL stat increament\n"); + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_dpmul_acesses[m_sid]=m_stats->m_num_dpmul_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_dpmul_acesses[m_sid]=m_stats->m_num_dpmul_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpdiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_dpdiv_acesses[m_sid]=m_stats->m_num_dpdiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_dpdiv_acesses[m_sid]=m_stats->m_num_dpdiv_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incsqrt_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_sqrt_acesses[m_sid]=m_stats->m_num_sqrt_acesses[m_sid]+(double)active_count*latency + + 
inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_sqrt_acesses[m_sid]=m_stats->m_num_sqrt_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inclog_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_log_acesses[m_sid]=m_stats->m_num_log_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_log_acesses[m_sid]=m_stats->m_num_log_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incexp_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_exp_acesses[m_sid]=m_stats->m_num_exp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_exp_acesses[m_sid]=m_stats->m_num_exp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void incfpmul_stat(unsigned active_count, double latency) { - // printf("FP MUL stat increament\n"); - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_fpmul_acesses[m_sid] = - m_stats->m_num_fpmul_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_fpmul_acesses[m_sid] = - m_stats->m_num_fpmul_acesses[m_sid] + active_count * latency; + + void incsin_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_sin_acesses[m_sid]=m_stats->m_num_sin_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_sin_acesses[m_sid]=m_stats->m_num_sin_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void incfpdiv_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_fpdiv_acesses[m_sid] = - m_stats->m_num_fpdiv_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_fpdiv_acesses[m_sid] = - m_stats->m_num_fpdiv_acesses[m_sid] + active_count * latency; + + + void inctensor_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_tensor_core_acesses[m_sid]=m_stats->m_num_tensor_core_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_tensor_core_acesses[m_sid]=m_stats->m_num_tensor_core_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void inctrans_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_trans_acesses[m_sid] = - m_stats->m_num_trans_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_trans_acesses[m_sid] = - m_stats->m_num_trans_acesses[m_sid] + active_count * latency; + + void inctex_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + 
m_stats->m_num_tex_acesses[m_sid]=m_stats->m_num_tex_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_tex_acesses[m_sid]=m_stats->m_num_tex_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inc_const_accesses(unsigned active_count) { + m_stats->m_num_const_acesses[m_sid]=m_stats->m_num_const_acesses[m_sid]+active_count; } void incsfu_stat(unsigned active_count, double latency) { m_stats->m_num_sfu_acesses[m_sid] = - m_stats->m_num_sfu_acesses[m_sid] + active_count * latency; + m_stats->m_num_sfu_acesses[m_sid] + (double)active_count*latency; } void incsp_stat(unsigned active_count, double latency) { m_stats->m_num_sp_acesses[m_sid] = - m_stats->m_num_sp_acesses[m_sid] + active_count * latency; + m_stats->m_num_sp_acesses[m_sid] + (double)active_count*latency; } void incmem_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + active_count * latency + + m_stats->m_num_mem_acesses[m_sid] + (double)active_count*latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + active_count * latency; + m_stats->m_num_mem_acesses[m_sid] + (double)active_count*latency; } } void incexecstat(warp_inst_t *&inst); @@ -2133,8 +2392,8 @@ class shader_core_ctx : public core_t { friend class TwoLevelScheduler; friend class LooseRoundRobbinScheduler; virtual void issue_warp(register_set &warp, const warp_inst_t *pI, - const active_mask_t &active_mask, unsigned warp_id, - unsigned sch_id); + const active_mask_t &active_mask, unsigned warp_id, + unsigned sch_id); void create_front_pipeline(); void create_schedulers(); diff --git a/src/gpgpu-sim/stat-tool.cc b/src/gpgpu-sim/stat-tool.cc index 6fafaa6af..0513d17ed 100644 --- a/src/gpgpu-sim/stat-tool.cc +++ b/src/gpgpu-sim/stat-tool.cc @@ -369,8 +369,6 @@ void shader_mem_lat_print(FILE *fout) { static int s_cache_access_logger_n_types = 0; static std::vector s_cache_access_logger; -enum cache_access_logger_types { NORMALS, TEXTURE, CONSTANT, INSTRUCTION }; - int get_shader_normal_cache_id() { return NORMALS; } int get_shader_texture_cache_id() { return TEXTURE; } int get_shader_constant_cache_id() { return CONSTANT; } diff --git a/src/gpgpu-sim/stat-tool.h b/src/gpgpu-sim/stat-tool.h index 3a291be3a..fdf875600 100644 --- a/src/gpgpu-sim/stat-tool.h +++ b/src/gpgpu-sim/stat-tool.h @@ -268,6 +268,8 @@ class linear_histogram_logger : public snap_shot_trigger, static int s_ids; }; +enum cache_access_logger_types { NORMALS, TEXTURE, CONSTANT, INSTRUCTION }; + void try_snap_shot(unsigned long long current_cycle); void set_spill_interval(unsigned long long interval); void spill_log_to_file(FILE *fout, int final, unsigned long long current_cycle); diff --git a/src/gpuwattch/gpgpu_sim_wrapper.cc b/src/gpuwattch/gpgpu_sim_wrapper.cc deleted file mode 100644 index f2989f630..000000000 --- a/src/gpuwattch/gpgpu_sim_wrapper.cc +++ /dev/null @@ -1,863 +0,0 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, -// The University of British Columbia -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. - -#include "gpgpu_sim_wrapper.h" -#include -#define SP_BASE_POWER 0 -#define SFU_BASE_POWER 0 - -static const char* pwr_cmp_label[] = { - "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", "SHRDP,", - "RFP,", "SPP,", "SFUP,", "FPUP,", "SCHEDP,", "L2CP,", - "MCP,", "NOCP,", "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONST_DYNAMICP"}; - -enum pwr_cmp_t { - IBP = 0, - ICP, - DCP, - TCP, - CCP, - SHRDP, - RFP, - SPP, - SFUP, - FPUP, - SCHEDP, - L2CP, - MCP, - NOCP, - DRAMP, - PIPEP, - IDLE_COREP, - CONST_DYNAMICP, - NUM_COMPONENTS_MODELLED -}; - -gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, - char* xmlfile) { - kernel_sample_count = 0; - total_sample_count = 0; - - kernel_tot_power = 0; - - num_pwr_cmps = NUM_COMPONENTS_MODELLED; - num_perf_counters = NUM_PERFORMANCE_COUNTERS; - - // Initialize per-component counter/power vectors - avg_max_min_counters init; - kernel_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, init); - kernel_cmp_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, init); - - kernel_power = init; // Per-kernel powers - gpu_tot_power = init; // Global powers - - sample_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, 0); - - sample_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, 0); - initpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); - effpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); - - const_dynamic_power = 0; - proc_power = 0; - - g_power_filename = NULL; - g_power_trace_filename = NULL; - g_metric_trace_filename = NULL; - g_steady_state_tracking_filename = NULL; - xml_filename = xmlfile; - g_power_simulation_enabled = power_simulation_enabled; - g_power_trace_enabled = false; - g_steady_power_levels_enabled = false; - g_power_trace_zlevel = 0; - g_power_per_cycle_dump = false; - gpu_steady_power_deviation = 0; - gpu_steady_min_period = 0; - - gpu_stat_sample_freq = 0; - p = new ParseXML(); - if (g_power_simulation_enabled) { - p->parse(xml_filename); - } - proc = new Processor(p); - power_trace_file = NULL; - metric_trace_file = NULL; - steady_state_tacking_file = NULL; - 
has_written_avg = false; - init_inst_val = false; -} - -gpgpu_sim_wrapper::~gpgpu_sim_wrapper() {} - -bool gpgpu_sim_wrapper::sanity_check(double a, double b) { - if (b == 0) - return (abs(a - b) < 0.00001); - else - return (abs(a - b) / abs(b) < 0.00001); - - return false; -} -void gpgpu_sim_wrapper::init_mcpat( - char* xmlfile, char* powerfilename, char* power_trace_filename, - char* metric_trace_filename, char* steady_state_filename, - bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, - bool power_per_cycle_dump, double steady_power_deviation, - double steady_min_period, int zlevel, double init_val, - int stat_sample_freq) { - // Write File Headers for (-metrics trace, -power trace) - - reset_counters(); - static bool mcpat_init = true; - - // initialize file name if it is not set - time_t curr_time; - time(&curr_time); - char* date = ctime(&curr_time); - char* s = date; - while (*s) { - if (*s == ' ' || *s == '\t' || *s == ':') *s = '-'; - if (*s == '\n' || *s == '\r') *s = 0; - s++; - } - - if (mcpat_init) { - g_power_filename = powerfilename; - g_power_trace_filename = power_trace_filename; - g_metric_trace_filename = metric_trace_filename; - g_steady_state_tracking_filename = steady_state_filename; - xml_filename = xmlfile; - g_power_simulation_enabled = power_sim_enabled; - g_power_trace_enabled = trace_enabled; - g_steady_power_levels_enabled = steady_state_enabled; - g_power_trace_zlevel = zlevel; - g_power_per_cycle_dump = power_per_cycle_dump; - gpu_steady_power_deviation = steady_power_deviation; - gpu_steady_min_period = steady_min_period; - - gpu_stat_sample_freq = stat_sample_freq; - - // p->sys.total_cycles=gpu_stat_sample_freq*4; - p->sys.total_cycles = gpu_stat_sample_freq; - power_trace_file = NULL; - metric_trace_file = NULL; - steady_state_tacking_file = NULL; - - if (g_power_trace_enabled) { - power_trace_file = gzopen(g_power_trace_filename, "w"); - metric_trace_file = gzopen(g_metric_trace_filename, "w"); - if ((power_trace_file == NULL) || (metric_trace_file == NULL)) { - printf("error - could not open trace files \n"); - exit(1); - } - gzsetparams(power_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); - - gzprintf(power_trace_file, "power,"); - for (unsigned i = 0; i < num_pwr_cmps; i++) { - gzprintf(power_trace_file, pwr_cmp_label[i]); - } - gzprintf(power_trace_file, "\n"); - - gzsetparams(metric_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); - for (unsigned i = 0; i < num_perf_counters; i++) { - gzprintf(metric_trace_file, perf_count_label[i]); - } - gzprintf(metric_trace_file, "\n"); - - gzclose(power_trace_file); - gzclose(metric_trace_file); - } - if (g_steady_power_levels_enabled) { - steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "w"); - if ((steady_state_tacking_file == NULL)) { - printf("error - could not open trace files \n"); - exit(1); - } - gzsetparams(steady_state_tacking_file, g_power_trace_zlevel, - Z_DEFAULT_STRATEGY); - gzprintf(steady_state_tacking_file, "start,end,power,IPC,"); - for (unsigned i = 0; i < num_perf_counters; i++) { - gzprintf(steady_state_tacking_file, perf_count_label[i]); - } - gzprintf(steady_state_tacking_file, "\n"); - - gzclose(steady_state_tacking_file); - } - - mcpat_init = false; - has_written_avg = false; - powerfile.open(g_power_filename); - int flg = chmod(g_power_filename, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - assert(flg == 0); - } - sample_val = 0; - init_inst_val = init_val; // gpu_tot_sim_insn+gpu_sim_insn; -} - -void 
gpgpu_sim_wrapper::reset_counters() { - avg_max_min_counters init; - for (unsigned i = 0; i < num_perf_counters; ++i) { - sample_perf_counters[i] = 0; - kernel_cmp_perf_counters[i] = init; - } - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - sample_cmp_pwr[i] = 0; - kernel_cmp_pwr[i] = init; - } - - // Reset per-kernel counters - kernel_sample_count = 0; - kernel_tot_power = 0; - kernel_power = init; - - return; -} - -void gpgpu_sim_wrapper::set_inst_power(bool clk_gated_lanes, double tot_cycles, - double busy_cycles, double tot_inst, - double int_inst, double fp_inst, - double load_inst, double store_inst, - double committed_inst) { - p->sys.core[0].gpgpu_clock_gated_lanes = clk_gated_lanes; - p->sys.core[0].total_cycles = tot_cycles; - p->sys.core[0].busy_cycles = busy_cycles; - p->sys.core[0].total_instructions = - tot_inst * p->sys.scaling_coefficients[TOT_INST]; - p->sys.core[0].int_instructions = - int_inst * p->sys.scaling_coefficients[FP_INT]; - p->sys.core[0].fp_instructions = - fp_inst * p->sys.scaling_coefficients[FP_INT]; - p->sys.core[0].load_instructions = load_inst; - p->sys.core[0].store_instructions = store_inst; - p->sys.core[0].committed_instructions = committed_inst; - sample_perf_counters[FP_INT] = int_inst + fp_inst; - sample_perf_counters[TOT_INST] = tot_inst; -} - -void gpgpu_sim_wrapper::set_regfile_power(double reads, double writes, - double ops) { - p->sys.core[0].int_regfile_reads = - reads * p->sys.scaling_coefficients[REG_RD]; - p->sys.core[0].int_regfile_writes = - writes * p->sys.scaling_coefficients[REG_WR]; - p->sys.core[0].non_rf_operands = - ops * p->sys.scaling_coefficients[NON_REG_OPs]; - sample_perf_counters[REG_RD] = reads; - sample_perf_counters[REG_WR] = writes; - sample_perf_counters[NON_REG_OPs] = ops; -} - -void gpgpu_sim_wrapper::set_icache_power(double hits, double misses) { - p->sys.core[0].icache.read_accesses = - hits * p->sys.scaling_coefficients[IC_H] + - misses * p->sys.scaling_coefficients[IC_M]; - p->sys.core[0].icache.read_misses = - misses * p->sys.scaling_coefficients[IC_M]; - sample_perf_counters[IC_H] = hits; - sample_perf_counters[IC_M] = misses; -} - -void gpgpu_sim_wrapper::set_ccache_power(double hits, double misses) { - p->sys.core[0].ccache.read_accesses = - hits * p->sys.scaling_coefficients[CC_H] + - misses * p->sys.scaling_coefficients[CC_M]; - p->sys.core[0].ccache.read_misses = - misses * p->sys.scaling_coefficients[CC_M]; - sample_perf_counters[CC_H] = hits; - sample_perf_counters[CC_M] = misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_tcache_power(double hits, double misses) { - p->sys.core[0].tcache.read_accesses = - hits * p->sys.scaling_coefficients[TC_H] + - misses * p->sys.scaling_coefficients[TC_M]; - p->sys.core[0].tcache.read_misses = - misses * p->sys.scaling_coefficients[TC_M]; - sample_perf_counters[TC_H] = hits; - sample_perf_counters[TC_M] = misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_shrd_mem_power(double accesses) { - p->sys.core[0].sharedmemory.read_accesses = - accesses * p->sys.scaling_coefficients[SHRD_ACC]; - sample_perf_counters[SHRD_ACC] = accesses; -} - -void gpgpu_sim_wrapper::set_l1cache_power(double read_hits, double read_misses, - double write_hits, - double write_misses) { - p->sys.core[0].dcache.read_accesses = - read_hits * p->sys.scaling_coefficients[DC_RH] + 
- read_misses * p->sys.scaling_coefficients[DC_RM]; - p->sys.core[0].dcache.read_misses = - read_misses * p->sys.scaling_coefficients[DC_RM]; - p->sys.core[0].dcache.write_accesses = - write_hits * p->sys.scaling_coefficients[DC_WH] + - write_misses * p->sys.scaling_coefficients[DC_WM]; - p->sys.core[0].dcache.write_misses = - write_misses * p->sys.scaling_coefficients[DC_WM]; - sample_perf_counters[DC_RH] = read_hits; - sample_perf_counters[DC_RM] = read_misses; - sample_perf_counters[DC_WH] = write_hits; - sample_perf_counters[DC_WM] = write_misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses, - double write_hits, - double write_misses) { - p->sys.l2.total_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + - read_misses * p->sys.scaling_coefficients[L2_RM] + - write_hits * p->sys.scaling_coefficients[L2_WH] + - write_misses * p->sys.scaling_coefficients[L2_WM]; - p->sys.l2.read_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + - read_misses * p->sys.scaling_coefficients[L2_RM]; - p->sys.l2.write_accesses = write_hits * p->sys.scaling_coefficients[L2_WH] + - write_misses * p->sys.scaling_coefficients[L2_WM]; - p->sys.l2.read_hits = read_hits * p->sys.scaling_coefficients[L2_RH]; - p->sys.l2.read_misses = read_misses * p->sys.scaling_coefficients[L2_RM]; - p->sys.l2.write_hits = write_hits * p->sys.scaling_coefficients[L2_WH]; - p->sys.l2.write_misses = write_misses * p->sys.scaling_coefficients[L2_WM]; - sample_perf_counters[L2_RH] = read_hits; - sample_perf_counters[L2_RM] = read_misses; - sample_perf_counters[L2_WH] = write_hits; - sample_perf_counters[L2_WM] = write_misses; -} - -void gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) { - p->sys.num_idle_cores = num_idle_core; - sample_perf_counters[IDLE_CORE_N] = num_idle_core; -} - -void gpgpu_sim_wrapper::set_duty_cycle_power(double duty_cycle) { - p->sys.core[0].pipeline_duty_cycle = - duty_cycle * p->sys.scaling_coefficients[PIPE_A]; - sample_perf_counters[PIPE_A] = duty_cycle; -} - -void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes, - double dram_precharge) { - p->sys.mc.memory_accesses = reads * p->sys.scaling_coefficients[MEM_RD] + - writes * p->sys.scaling_coefficients[MEM_WR]; - p->sys.mc.memory_reads = reads * p->sys.scaling_coefficients[MEM_RD]; - p->sys.mc.memory_writes = writes * p->sys.scaling_coefficients[MEM_WR]; - p->sys.mc.dram_pre = dram_precharge * p->sys.scaling_coefficients[MEM_PRE]; - sample_perf_counters[MEM_RD] = reads; - sample_perf_counters[MEM_WR] = writes; - sample_perf_counters[MEM_PRE] = dram_precharge; -} - -void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses, - double ialu_accesses, - double sfu_accesses) { - p->sys.core[0].fpu_accesses = - fpu_accesses * p->sys.scaling_coefficients[FPU_ACC]; - // Integer ALU (not present in Tesla) - p->sys.core[0].ialu_accesses = - ialu_accesses * p->sys.scaling_coefficients[SP_ACC]; - // Sfu accesses - p->sys.core[0].mul_accesses = - sfu_accesses * p->sys.scaling_coefficients[SFU_ACC]; - - sample_perf_counters[SP_ACC] = ialu_accesses; - sample_perf_counters[SFU_ACC] = sfu_accesses; - sample_perf_counters[FPU_ACC] = fpu_accesses; -} - -void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane, - double sfu_avg_active_lane) { - p->sys.core[0].sp_average_active_lanes = sp_avg_active_lane; - p->sys.core[0].sfu_average_active_lanes = 
sfu_avg_active_lane; -} - -void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_reads, - double noc_tot_writes) { - p->sys.NoC[0].total_accesses = - noc_tot_reads * p->sys.scaling_coefficients[NOC_A] + - noc_tot_writes * p->sys.scaling_coefficients[NOC_A]; - sample_perf_counters[NOC_A] = noc_tot_reads + noc_tot_writes; -} - -void gpgpu_sim_wrapper::power_metrics_calculations() { - total_sample_count++; - kernel_sample_count++; - - // Current sample power - double sample_power = - proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONST_DYNAMICP]; - - // Average power - // Previous + new + constant dynamic power (e.g., dynamic clocking power) - kernel_tot_power += sample_power; - kernel_power.avg = kernel_tot_power / kernel_sample_count; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - kernel_cmp_pwr[ind].avg += (double)sample_cmp_pwr[ind]; - } - - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].avg += (double)sample_perf_counters[ind]; - } - - // Max Power - if (sample_power > kernel_power.max) { - kernel_power.max = sample_power; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - kernel_cmp_pwr[ind].max = (double)sample_cmp_pwr[ind]; - } - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].max = sample_perf_counters[ind]; - } - } - - // Min Power - if (sample_power < kernel_power.min || (kernel_power.min == 0)) { - kernel_power.min = sample_power; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - kernel_cmp_pwr[ind].min = (double)sample_cmp_pwr[ind]; - } - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].min = sample_perf_counters[ind]; - } - } - - gpu_tot_power.avg = (gpu_tot_power.avg + sample_power); - gpu_tot_power.max = - (sample_power > gpu_tot_power.max) ? sample_power : gpu_tot_power.max; - gpu_tot_power.min = - ((sample_power < gpu_tot_power.min) || (gpu_tot_power.min == 0)) - ? 
sample_power - : gpu_tot_power.min; -} - -void gpgpu_sim_wrapper::print_trace_files() { - open_files(); - - for (unsigned i = 0; i < num_perf_counters; ++i) { - gzprintf(metric_trace_file, "%f,", sample_perf_counters[i]); - } - gzprintf(metric_trace_file, "\n"); - - gzprintf(power_trace_file, "%f,", proc_power); - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - gzprintf(power_trace_file, "%f,", sample_cmp_pwr[i]); - } - gzprintf(power_trace_file, "\n"); - - close_files(); -} - -void gpgpu_sim_wrapper::update_coefficients() { - initpower_coeff[FP_INT] = proc->cores[0]->get_coefficient_fpint_insts(); - effpower_coeff[FP_INT] = - initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT]; - - initpower_coeff[TOT_INST] = proc->cores[0]->get_coefficient_tot_insts(); - effpower_coeff[TOT_INST] = - initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST]; - - initpower_coeff[REG_RD] = - proc->cores[0]->get_coefficient_regreads_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - initpower_coeff[REG_WR] = - proc->cores[0]->get_coefficient_regwrites_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - initpower_coeff[NON_REG_OPs] = - proc->cores[0]->get_coefficient_noregfileops_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - effpower_coeff[REG_RD] = - initpower_coeff[REG_RD] * p->sys.scaling_coefficients[REG_RD]; - effpower_coeff[REG_WR] = - initpower_coeff[REG_WR] * p->sys.scaling_coefficients[REG_WR]; - effpower_coeff[NON_REG_OPs] = - initpower_coeff[NON_REG_OPs] * p->sys.scaling_coefficients[NON_REG_OPs]; - - initpower_coeff[IC_H] = proc->cores[0]->get_coefficient_icache_hits(); - initpower_coeff[IC_M] = proc->cores[0]->get_coefficient_icache_misses(); - effpower_coeff[IC_H] = - initpower_coeff[IC_H] * p->sys.scaling_coefficients[IC_H]; - effpower_coeff[IC_M] = - initpower_coeff[IC_M] * p->sys.scaling_coefficients[IC_M]; - - initpower_coeff[CC_H] = (proc->cores[0]->get_coefficient_ccache_readhits() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[CC_M] = (proc->cores[0]->get_coefficient_ccache_readmisses() + - proc->get_coefficient_readcoalescing()); - effpower_coeff[CC_H] = - initpower_coeff[CC_H] * p->sys.scaling_coefficients[CC_H]; - effpower_coeff[CC_M] = - initpower_coeff[CC_M] * p->sys.scaling_coefficients[CC_M]; - - initpower_coeff[TC_H] = (proc->cores[0]->get_coefficient_tcache_readhits() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[TC_M] = (proc->cores[0]->get_coefficient_tcache_readmisses() + - proc->get_coefficient_readcoalescing()); - effpower_coeff[TC_H] = - initpower_coeff[TC_H] * p->sys.scaling_coefficients[TC_H]; - effpower_coeff[TC_M] = - initpower_coeff[TC_M] * p->sys.scaling_coefficients[TC_M]; - - initpower_coeff[SHRD_ACC] = - proc->cores[0]->get_coefficient_sharedmemory_readhits(); - effpower_coeff[SHRD_ACC] = - initpower_coeff[SHRD_ACC] * p->sys.scaling_coefficients[SHRD_ACC]; - - initpower_coeff[DC_RH] = (proc->cores[0]->get_coefficient_dcache_readhits() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[DC_RM] = - (proc->cores[0]->get_coefficient_dcache_readmisses() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[DC_WH] = (proc->cores[0]->get_coefficient_dcache_writehits() + - proc->get_coefficient_writecoalescing()); - initpower_coeff[DC_WM] = - (proc->cores[0]->get_coefficient_dcache_writemisses() + - proc->get_coefficient_writecoalescing()); - effpower_coeff[DC_RH] = - 
initpower_coeff[DC_RH] * p->sys.scaling_coefficients[DC_RH]; - effpower_coeff[DC_RM] = - initpower_coeff[DC_RM] * p->sys.scaling_coefficients[DC_RM]; - effpower_coeff[DC_WH] = - initpower_coeff[DC_WH] * p->sys.scaling_coefficients[DC_WH]; - effpower_coeff[DC_WM] = - initpower_coeff[DC_WM] * p->sys.scaling_coefficients[DC_WM]; - - initpower_coeff[L2_RH] = proc->get_coefficient_l2_read_hits(); - initpower_coeff[L2_RM] = proc->get_coefficient_l2_read_misses(); - initpower_coeff[L2_WH] = proc->get_coefficient_l2_write_hits(); - initpower_coeff[L2_WM] = proc->get_coefficient_l2_write_misses(); - effpower_coeff[L2_RH] = - initpower_coeff[L2_RH] * p->sys.scaling_coefficients[L2_RH]; - effpower_coeff[L2_RM] = - initpower_coeff[L2_RM] * p->sys.scaling_coefficients[L2_RM]; - effpower_coeff[L2_WH] = - initpower_coeff[L2_WH] * p->sys.scaling_coefficients[L2_WH]; - effpower_coeff[L2_WM] = - initpower_coeff[L2_WM] * p->sys.scaling_coefficients[L2_WM]; - - initpower_coeff[IDLE_CORE_N] = - p->sys.idle_core_power * proc->cores[0]->executionTime; - effpower_coeff[IDLE_CORE_N] = - initpower_coeff[IDLE_CORE_N] * p->sys.scaling_coefficients[IDLE_CORE_N]; - - initpower_coeff[PIPE_A] = proc->cores[0]->get_coefficient_duty_cycle(); - effpower_coeff[PIPE_A] = - initpower_coeff[PIPE_A] * p->sys.scaling_coefficients[PIPE_A]; - - initpower_coeff[MEM_RD] = proc->get_coefficient_mem_reads(); - initpower_coeff[MEM_WR] = proc->get_coefficient_mem_writes(); - initpower_coeff[MEM_PRE] = proc->get_coefficient_mem_pre(); - effpower_coeff[MEM_RD] = - initpower_coeff[MEM_RD] * p->sys.scaling_coefficients[MEM_RD]; - effpower_coeff[MEM_WR] = - initpower_coeff[MEM_WR] * p->sys.scaling_coefficients[MEM_WR]; - effpower_coeff[MEM_PRE] = - initpower_coeff[MEM_PRE] * p->sys.scaling_coefficients[MEM_PRE]; - - initpower_coeff[SP_ACC] = - proc->cores[0]->get_coefficient_ialu_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - ; - initpower_coeff[SFU_ACC] = proc->cores[0]->get_coefficient_sfu_accesses(); - initpower_coeff[FPU_ACC] = proc->cores[0]->get_coefficient_fpu_accesses(); - - effpower_coeff[SP_ACC] = - initpower_coeff[SP_ACC] * p->sys.scaling_coefficients[SP_ACC]; - effpower_coeff[SFU_ACC] = - initpower_coeff[SFU_ACC] * p->sys.scaling_coefficients[SFU_ACC]; - effpower_coeff[FPU_ACC] = - initpower_coeff[FPU_ACC] * p->sys.scaling_coefficients[FPU_ACC]; - - initpower_coeff[NOC_A] = proc->get_coefficient_noc_accesses(); - effpower_coeff[NOC_A] = - initpower_coeff[NOC_A] * p->sys.scaling_coefficients[NOC_A]; - - const_dynamic_power = - proc->get_const_dynamic_power() / (proc->cores[0]->executionTime); - - for (unsigned i = 0; i < num_perf_counters; i++) { - initpower_coeff[i] /= (proc->cores[0]->executionTime); - effpower_coeff[i] /= (proc->cores[0]->executionTime); - } -} - -void gpgpu_sim_wrapper::update_components_power() { - update_coefficients(); - - proc_power = proc->rt_power.readOp.dynamic; - - sample_cmp_pwr[IBP] = - (proc->cores[0]->ifu->IB->rt_power.readOp.dynamic + - proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic + - proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic + - proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic + - proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic) / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[ICP] = proc->cores[0]->ifu->icache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[DCP] = proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[TCP] = 
proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[CCP] = proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[SHRDP] = - proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[RFP] = - (proc->cores[0]->exu->rfu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)) * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - - sample_cmp_pwr[SPP] = - (proc->cores[0]->exu->exeu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)) * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - - sample_cmp_pwr[SFUP] = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)); - - sample_cmp_pwr[FPUP] = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)); - - sample_cmp_pwr[SCHEDP] = proc->cores[0]->exu->scheu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[L2CP] = (proc->XML->sys.number_of_L2s > 0) - ? proc->l2array[0]->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime) - : 0; - - sample_cmp_pwr[MCP] = (proc->mc->rt_power.readOp.dynamic - - proc->mc->dram->rt_power.readOp.dynamic) / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[NOCP] = - proc->nocs[0]->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); - - sample_cmp_pwr[DRAMP] = - proc->mc->dram->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); - - sample_cmp_pwr[PIPEP] = - proc->cores[0]->Pipeline_energy / (proc->cores[0]->executionTime); - - sample_cmp_pwr[IDLE_COREP] = - proc->cores[0]->IdleCoreEnergy / (proc->cores[0]->executionTime); - - // This constant dynamic power (e.g., clock power) part is estimated via - // regression model. - sample_cmp_pwr[CONST_DYNAMICP] = 0; - double cnst_dyn = - proc->get_const_dynamic_power() / (proc->cores[0]->executionTime); - // If the regression scaling term is greater than the recorded constant - // dynamic power then use the difference (other portion already added to - // dynamic power). Else, all the constant dynamic power is accounted for, add - // nothing. 
- if (p->sys.scaling_coefficients[CONST_DYNAMICN] > cnst_dyn) - sample_cmp_pwr[CONST_DYNAMICP] = - (p->sys.scaling_coefficients[CONST_DYNAMICN] - cnst_dyn); - - proc_power += sample_cmp_pwr[CONST_DYNAMICP]; - - double sum_pwr_cmp = 0; - for (unsigned i = 0; i < num_pwr_cmps; i++) { - sum_pwr_cmp += sample_cmp_pwr[i]; - } - bool check = false; - check = sanity_check(sum_pwr_cmp, proc_power); - assert("Total Power does not equal the sum of the components\n" && (check)); -} - -void gpgpu_sim_wrapper::compute() { proc->compute(); } -void gpgpu_sim_wrapper::print_power_kernel_stats( - double gpu_sim_cycle, double gpu_tot_sim_cycle, double init_value, - const std::string& kernel_info_string, bool print_trace) { - detect_print_steady_state(1, init_value); - if (g_power_simulation_enabled) { - powerfile << kernel_info_string << std::endl; - - sanity_check((kernel_power.avg * kernel_sample_count), kernel_tot_power); - powerfile << "Kernel Average Power Data:" << std::endl; - powerfile << "kernel_avg_power = " << kernel_power.avg << std::endl; - - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_avg_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].avg / kernel_sample_count << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_avg_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].avg / kernel_sample_count - << std::endl; - } - - powerfile << std::endl << "Kernel Maximum Power Data:" << std::endl; - powerfile << "kernel_max_power = " << kernel_power.max << std::endl; - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_max_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].max << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_max_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].max << std::endl; - } - - powerfile << std::endl << "Kernel Minimum Power Data:" << std::endl; - powerfile << "kernel_min_power = " << kernel_power.min << std::endl; - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_min_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].min << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_min_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].min << std::endl; - } - - powerfile << std::endl - << "Accumulative Power Statistics Over Previous Kernels:" - << std::endl; - powerfile << "gpu_tot_avg_power = " - << gpu_tot_power.avg / total_sample_count << std::endl; - powerfile << "gpu_tot_max_power = " << gpu_tot_power.max << std::endl; - powerfile << "gpu_tot_min_power = " << gpu_tot_power.min << std::endl; - powerfile << std::endl << std::endl; - powerfile.flush(); - - if (print_trace) { - print_trace_files(); - } - } -} -void gpgpu_sim_wrapper::dump() { - if (g_power_per_cycle_dump) proc->displayEnergy(2, 5); -} - -void gpgpu_sim_wrapper::print_steady_state(int position, double init_val) { - double temp_avg = sample_val / (double)samples.size(); - double temp_ipc = (init_val - init_inst_val) / - (double)(samples.size() * gpu_stat_sample_freq); - - if ((samples.size() > - gpu_steady_min_period)) { // If steady state occurred for some time, - // print to file - has_written_avg = true; - gzprintf(steady_state_tacking_file, "%u,%d,%f,%f,", sample_start, - total_sample_count, temp_avg, temp_ipc); - for (unsigned i = 0; i < num_perf_counters; ++i) { - gzprintf(steady_state_tacking_file, "%f,", - samples_counter.at(i) / ((double)samples.size())); - } - 
gzprintf(steady_state_tacking_file, "\n"); - } else { - if (!has_written_avg && position) - gzprintf(steady_state_tacking_file, - "ERROR! Not enough steady state points to generate average\n"); - } - - sample_start = 0; - sample_val = 0; - init_inst_val = init_val; - samples.clear(); - samples_counter.clear(); - pwr_counter.clear(); - assert(samples.size() == 0); -} - -void gpgpu_sim_wrapper::detect_print_steady_state(int position, - double init_val) { - // Calculating Average - if (g_power_simulation_enabled && g_steady_power_levels_enabled) { - steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "a"); - if (position == 0) { - if (samples.size() == 0) { - // First sample - sample_start = total_sample_count; - sample_val = proc->rt_power.readOp.dynamic; - init_inst_val = init_val; - samples.push_back(proc->rt_power.readOp.dynamic); - assert(samples_counter.size() == 0); - assert(pwr_counter.size() == 0); - - for (unsigned i = 0; i < (num_perf_counters); ++i) { - samples_counter.push_back(sample_perf_counters[i]); - } - - for (unsigned i = 0; i < (num_pwr_cmps); ++i) { - pwr_counter.push_back(sample_cmp_pwr[i]); - } - assert(pwr_counter.size() == (double)num_pwr_cmps); - assert(samples_counter.size() == (double)num_perf_counters); - } else { - // Get current average - double temp_avg = sample_val / (double)samples.size(); - - if (abs(proc->rt_power.readOp.dynamic - temp_avg) < - gpu_steady_power_deviation) { // Value is within threshold - sample_val += proc->rt_power.readOp.dynamic; - samples.push_back(proc->rt_power.readOp.dynamic); - for (unsigned i = 0; i < (num_perf_counters); ++i) { - samples_counter.at(i) += sample_perf_counters[i]; - } - - for (unsigned i = 0; i < (num_pwr_cmps); ++i) { - pwr_counter.at(i) += sample_cmp_pwr[i]; - } - - } else { // Value exceeds threshold, not considered steady state - print_steady_state(position, init_val); - } - } - } else { - print_steady_state(position, init_val); - } - gzclose(steady_state_tacking_file); - } -} - -void gpgpu_sim_wrapper::open_files() { - if (g_power_simulation_enabled) { - if (g_power_trace_enabled) { - power_trace_file = gzopen(g_power_trace_filename, "a"); - metric_trace_file = gzopen(g_metric_trace_filename, "a"); - } - } -} -void gpgpu_sim_wrapper::close_files() { - if (g_power_simulation_enabled) { - if (g_power_trace_enabled) { - gzclose(power_trace_file); - gzclose(metric_trace_file); - } - } -} diff --git a/version b/version index 1a1a990cd..09e18b115 100644 --- a/version +++ b/version @@ -1 +1 @@ -const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.0.0 "; +const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.2.0 ";
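The hunks above touch two layers of the power plumbing: the per-SM execution-unit counters added to shader.h and the GPUWattch wrapper that this patch deletes in favour of AccelWattch. The short C++ sketches below illustrate the accounting patterns involved. They are illustrations only: every type, function, parameter, and constant in them (ExuStats, inactive_lane_cost, the 32-lane warp width, the sample numbers) is an assumption made for the example, not code from this repository.

First, the inc*_stat helpers: each issued warp adds active_count * latency to its unit's access counter, adds an extra charge for the idle lanes when gpgpu_clock_gated_lanes is false, and bumps the active-thread and active-warp totals used for execution-unit scaling.

    // Minimal sketch of the inc*_stat accounting pattern (assumptions:
    // 32-lane warps and a simple one-latency-per-idle-lane cost model).
    #include <cstdio>
    #include <vector>

    struct ExuStats {
      std::vector<double> num_fp_accesses;     // per-SM weighted access count
      std::vector<double> active_exu_threads;  // per-SM active-thread total
      std::vector<double> active_exu_warps;    // per-SM issued-warp total
      explicit ExuStats(unsigned num_sm)
          : num_fp_accesses(num_sm, 0.0),
            active_exu_threads(num_sm, 0.0),
            active_exu_warps(num_sm, 0.0) {}
    };

    // Hypothetical inactive-lane cost; the simulator derives this from its
    // own configuration, the sketch just charges each idle lane one latency.
    static double inactive_lane_cost(unsigned active_count, double latency,
                                     unsigned warp_size = 32) {
      return (warp_size - active_count) * latency;
    }

    // Mirrors the shape of incfpalu_stat in the hunk above.
    void inc_fpalu_stat(ExuStats &stats, unsigned sid, unsigned active_count,
                        double latency, bool clock_gated_lanes) {
      stats.num_fp_accesses[sid] += (double)active_count * latency;
      if (!clock_gated_lanes)  // idle lanes still burn power without gating
        stats.num_fp_accesses[sid] += inactive_lane_cost(active_count, latency);
      stats.active_exu_threads[sid] += active_count;
      stats.active_exu_warps[sid] += 1.0;
    }

    int main() {
      ExuStats stats(1);
      inc_fpalu_stat(stats, 0, 24, 2.0, /*clock_gated_lanes=*/false);
      std::printf("fp accesses: %.1f, threads: %.0f, warps: %.0f\n",
                  stats.num_fp_accesses[0], stats.active_exu_threads[0],
                  stats.active_exu_warps[0]);
      return 0;
    }

With 24 of 32 lanes active, a latency of 2, and no clock gating, this records 24*2 + 8*2 = 64 weighted accesses, 24 active threads, and one warp.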
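The wrapper's set_*_power() methods convert raw simulator counters into McPAT activity inputs by weighting each event type with its own scaling coefficient, as the deleted set_icache_power() does for instruction-cache hits and misses. A sketch of that pattern; the coefficient values are invented for the example:

    // Weighted hit/miss accounting in the style of set_icache_power().
    #include <cstdio>

    struct CacheActivity {
      double read_accesses = 0.0;
      double read_misses = 0.0;
    };

    void set_icache_activity(CacheActivity &icache, double hits, double misses,
                             double coeff_hit, double coeff_miss) {
      // Accesses are the weighted sum of hits and misses; misses carry their
      // own weight so each event type can be calibrated separately.
      icache.read_accesses = hits * coeff_hit + misses * coeff_miss;
      icache.read_misses = misses * coeff_miss;
    }

    int main() {
      CacheActivity icache;
      set_icache_activity(icache, /*hits=*/1.0e6, /*misses=*/5.0e4,
                          /*coeff_hit=*/1.0, /*coeff_miss=*/1.2);
      std::printf("read_accesses=%.0f read_misses=%.0f\n", icache.read_accesses,
                  icache.read_misses);
      return 0;
    }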
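power_metrics_calculations() keeps running average, maximum, and minimum power per kernel, seeding the minimum by treating 0 as "unset". A reduced version of that bookkeeping, with made-up sample values:

    // Running average/max/min over per-sample power readings.
    #include <cstdio>

    struct AvgMaxMin {
      double avg = 0.0, max = 0.0, min = 0.0;
    };

    void record_sample(AvgMaxMin &k, double &total, unsigned &count,
                       double sample_power) {
      ++count;
      total += sample_power;
      k.avg = total / count;                           // running average
      if (sample_power > k.max) k.max = sample_power;  // running maximum
      if (sample_power < k.min || k.min == 0.0)        // first sample seeds min
        k.min = sample_power;
    }

    int main() {
      AvgMaxMin kernel_power;
      double kernel_tot_power = 0.0;
      unsigned kernel_sample_count = 0;
      const double samples[] = {41.7, 55.2, 38.9};
      for (double s : samples)
        record_sample(kernel_power, kernel_tot_power, kernel_sample_count, s);
      std::printf("avg=%.2f max=%.2f min=%.2f\n", kernel_power.avg,
                  kernel_power.max, kernel_power.min);
      return 0;
    }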
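update_components_power() asserts that the per-component powers sum back to the processor total via sanity_check(), which compares with a relative tolerance and falls back to an absolute test when the reference value is zero. A sketch of that check with hypothetical component values:

    // Relative-tolerance comparison in the style of sanity_check().
    #include <cassert>
    #include <cmath>
    #include <vector>

    bool nearly_equal(double a, double b, double tol = 1e-5) {
      if (b == 0.0) return std::fabs(a - b) < tol;   // absolute test near zero
      return std::fabs(a - b) / std::fabs(b) < tol;  // relative test otherwise
    }

    int main() {
      std::vector<double> component_power = {12.5, 3.25, 7.75};  // watts, made up
      const double total = 23.5;
      double sum = 0.0;
      for (double p : component_power) sum += p;
      assert(nearly_equal(sum, total) &&
             "total power should equal the sum of the components");
      return 0;
    }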
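Finally, detect_print_steady_state() extends a window of samples for as long as each new power reading stays within gpu_steady_power_deviation of the window's running average, and reports the window once a reading breaks that bound. A loose, self-contained sketch of the same idea, assuming a 1 W threshold and a made-up power trace:

    // Steady-state window detection over a power trace (simplified).
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      const double deviation = 1.0;  // allowed deviation from the window average
      std::vector<double> window;    // samples in the current steady window
      double window_sum = 0.0;
      const double trace[] = {40.1, 40.4, 39.8, 40.2, 55.0, 54.6};
      for (double sample : trace) {
        double avg = window.empty() ? sample : window_sum / window.size();
        if (window.empty() || std::fabs(sample - avg) < deviation) {
          window.push_back(sample);  // still steady: extend the window
          window_sum += sample;
        } else {                     // bound broken: report and restart
          std::printf("steady window of %zu samples, avg %.2f W\n",
                      window.size(), window_sum / window.size());
          window.assign(1, sample);
          window_sum = sample;
        }
      }
      std::printf("final window of %zu samples, avg %.2f W\n", window.size(),
                  window_sum / window.size());
      return 0;
    }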